From ac2651a1e846e5ecb7d6c60382e7038ec49ca0a0 Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Thu, 8 Nov 2018 16:50:28 -0800 Subject: [PATCH 01/59] temp enable btl uct --- contrib/platform/mellanox/optimized | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/platform/mellanox/optimized b/contrib/platform/mellanox/optimized index f49a0576c64..339518e483e 100644 --- a/contrib/platform/mellanox/optimized +++ b/contrib/platform/mellanox/optimized @@ -1,4 +1,4 @@ -enable_mca_no_build=coll-ml,btl-uct +enable_mca_no_build=coll-ml enable_debug_symbols=yes enable_orterun_prefix_by_default=yes with_verbs=no From 4ad6e0b0094af47ac457a53991c71d804885d980 Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Mon, 12 Nov 2018 06:00:33 -0800 Subject: [PATCH 02/59] add data structures --- opal/mca/common/ucx/common_ucx.c | 68 ++++++++++++++++++++++++++++++++ opal/mca/common/ucx/common_ucx.h | 38 ++++++++++++++++++ 2 files changed, 106 insertions(+) diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index 84e26b221d3..f05a7879154 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -19,6 +19,74 @@ /***********************************************************************/ +typedef struct { + opal_mutex_t mutex; + ucp_worker_h worker; + ucp_ep_h *endpoints; + int comm_size; +} _worker_engine_t; + +OBJ_CLASS_DECLARATION(_worker_engine_t); + +typedef struct { + int ctx_id; + opal_common_ucx_ctx_t *gctx; + _worker_engine_t *worker; +} _tlocal_ctx_t; + +OBJ_CLASS_DECLARATION(_tlocal_ctx_t); + +typedef struct { + _worker_engine_t *worker; + ucp_rkey_h *rkeys; +} _mem_info_t; + +OBJ_CLASS_DECLARATION(_mem_info_t); + +typedef struct { + int mem_id; + opal_common_ucx_mem_t *gmem; + _mem_info_t *mem; +} _tlocal_mem_t; + +OBJ_CLASS_DECLARATION(_tlocal_mem_t); + +typedef struct { + opal_list_item_t super; + _worker_engine_t *ptr; +} _idle_list_item_t; + +OBJ_CLASS_DECLARATION(_idle_list_item_t); +OBJ_CLASS_INSTANCE(_idle_list_item_t, opal_list_item_t, NULL, NULL); + +typedef struct { + opal_list_item_t super; + _worker_engine_t *ptr; +} _worker_list_item_t; + +OBJ_CLASS_DECLARATION(_worker_list_item_t); +OBJ_CLASS_INSTANCE(_worker_list_item_t, opal_list_item_t, NULL, NULL); + +typedef struct { + opal_list_item_t super; + _mem_info_t *ptr; +} _mem_region_list_item_t; + +OBJ_CLASS_DECLARATION(_mem_region_list_item_t); +OBJ_CLASS_INSTANCE(_mem_region_list_item_t, opal_list_item_t, NULL, NULL); + +/* thread-local table */ +typedef struct { + _tlocal_ctx_t **ctx_tbl; + size_t ctx_tbl_size; + _tlocal_mem_t **mem_tbl; + size_t mem_tbl_size; +} _tlocal_table_t; + +static pthread_key_t _tlocal_key = {0}; + +/***********************************************************************/ + extern mca_base_framework_t opal_memory_base_framework; opal_common_ucx_module_t opal_common_ucx = { diff --git a/opal/mca/common/ucx/common_ucx.h b/opal/mca/common/ucx/common_ucx.h index e25dd23b821..a89cd051126 100644 --- a/opal/mca/common/ucx/common_ucx.h +++ b/opal/mca/common/ucx/common_ucx.h @@ -96,6 +96,44 @@ typedef struct opal_common_ucx_del_proc { extern opal_common_ucx_module_t opal_common_ucx; +typedef struct { + ucp_context_h ucp_ctx; + opal_mutex_t mutex; + opal_list_t idle_workers; + ucp_worker_h recv_worker; + ucp_address_t *recv_waddr; + size_t recv_waddr_len; + int cur_ctxid, cur_memid; +} opal_common_ucx_wpool_t; + +typedef struct { + int ctx_id; + opal_mutex_t mutex; + opal_common_ucx_wpool_t *wpool; /* which wpool this ctx belongs to */ + opal_list_t workers; /* active worker lists */ + char *recv_worker_addrs; + int *recv_worker_displs; +} opal_common_ucx_ctx_t; + +typedef struct { + int mem_id; + opal_mutex_t mutex; + opal_common_ucx_ctx_t *ctx; /* which ctx this mem_reg belongs to */ + opal_list_t mem_regions; /* mem region lists */ + char *mem_addrs; + int *mem_displs; +} opal_common_ucx_mem_t; + +typedef enum { + OPAL_COMMON_UCX_PUT, + OPAL_COMMON_UCX_GET +} opal_common_ucx_op_t; + +typedef enum { + OPAL_COMMON_UCX_SCOPE_EP, + OPAL_COMMON_UCX_SCOPE_WORKER +} opal_common_ucx_flush_scope_t; + OPAL_DECLSPEC void opal_common_ucx_mca_register(void); OPAL_DECLSPEC void opal_common_ucx_mca_deregister(void); OPAL_DECLSPEC void opal_common_ucx_empty_complete_cb(void *request, ucs_status_t status); From 34153d354fa291ab6774fdb97916a99b09952b72 Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Mon, 12 Nov 2018 08:40:41 -0800 Subject: [PATCH 03/59] init/finalize functions --- opal/mca/common/ucx/common_ucx.c | 148 +++++++++++++++++++++++++++++++ 1 file changed, 148 insertions(+) diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index f05a7879154..a6b1c98ff13 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -286,3 +286,151 @@ OPAL_DECLSPEC int opal_common_ucx_del_procs(opal_common_ucx_del_proc_t *procs, s return OPAL_SUCCESS; } +/***********************************************************************/ + +static inline void _cleanup_tlocal(void *arg) +{ + // 1. Cleanup all rkeys in the window table + // 2. Return all workers into the idle pool +} + +static ucp_worker_h _create_ctx_worker(opal_common_ucx_wpool_t *wpool) +{ + ucp_worker_params_t worker_params; + ucp_worker_h worker; + ucs_status_t status; + + memset(&worker_params, 0, sizeof(worker_params)); + worker_params.field_mask = UCP_WORKER_PARAM_FIELD_THREAD_MODE; + worker_params.thread_mode = UCS_THREAD_MODE_SINGLE; + status = ucp_worker_create(wpool->ucp_ctx, &worker_params, &worker); + if (UCS_OK != status) { + return NULL; + } + + return worker; +} + +static void _wpool_add_to_idle(opal_common_ucx_wpool_t *wpool, + _worker_engine_t *wkr) +{ + _idle_list_item_t *item; + + if(wkr->comm_size != 0) { + int i; + for (i = 0; i < wkr->comm_size; i++) { + ucp_ep_destroy(wkr->endpoints[i]); + } + free(wkr->endpoints); + wkr->endpoints = NULL; + wkr->comm_size = 0; + } + + item = OBJ_NEW(_idle_list_item_t); + item->ptr = wkr; + + opal_mutex_lock(&wpool->mutex); + opal_list_append(&wpool->idle_workers, &item->super); + opal_mutex_unlock(&wpool->mutex); +} + +static _worker_engine_t* _wpool_remove_from_idle(opal_common_ucx_wpool_t *wpool) +{ + _worker_engine_t *wkr = NULL; + _idle_list_item_t *item = NULL; + + opal_mutex_lock(&wpool->mutex); + if (!opal_list_is_empty(&wpool->idle_workers)) { + item = (_idle_list_item_t *)opal_list_get_first(&wpool->idle_workers); + opal_list_remove_item(&wpool->idle_workers, &item->super); + } + opal_mutex_unlock(&wpool->mutex); + + if (item != NULL) { + wkr = item->ptr; + OBJ_RELEASE(item); + } + + return wkr; +} + + +OPAL_DECLSPEC int opal_common_ucx_wpool_init(opal_common_ucx_wpool_t *wpool, + int proc_world_size, + ucp_request_init_callback_t req_init_ptr, + size_t req_size) +{ + ucp_config_t *config = NULL; + ucp_params_t context_params; + _worker_engine_t *wkr; + ucs_status_t status; + int ret = OPAL_SUCCESS; + + wpool->cur_ctxid = wpool->cur_memid = 0; + OBJ_CONSTRUCT(&wpool->mutex, opal_mutex_t); + + status = ucp_config_read("MPI", NULL, &config); + if (UCS_OK != status) { + MCA_COMMON_UCX_VERBOSE(1, "ucp_config_read failed: %d", status); + return OPAL_ERROR; + } + + /* initialize UCP context */ + memset(&context_params, 0, sizeof(context_params)); + context_params.field_mask = UCP_PARAM_FIELD_FEATURES | + UCP_PARAM_FIELD_MT_WORKERS_SHARED | + UCP_PARAM_FIELD_ESTIMATED_NUM_EPS | + UCP_PARAM_FIELD_REQUEST_INIT | + UCP_PARAM_FIELD_REQUEST_SIZE; + context_params.features = UCP_FEATURE_RMA | UCP_FEATURE_AMO32 | UCP_FEATURE_AMO64; + context_params.mt_workers_shared = 1; + context_params.estimated_num_eps = proc_world_size; + context_params.request_init = req_init_ptr; + context_params.request_size = req_size; + + status = ucp_init(&context_params, config, &wpool->ucp_ctx); + ucp_config_release(config); + if (UCS_OK != status) { + MCA_COMMON_UCX_VERBOSE(1, "ucp_init failed: %d", status); + ret = OPAL_ERROR; + goto err_ucp_init; + } + + /* create recv worker and add to idle pool */ + OBJ_CONSTRUCT(&wpool->idle_workers, opal_list_t); + wpool->recv_worker = _create_ctx_worker(wpool); + if (wpool->recv_worker == NULL) { + MCA_COMMON_UCX_VERBOSE(1, "_create_ctx_worker failed"); + ret = OPAL_ERROR; + goto err_worker_create; + } + + wkr = OBJ_NEW(_worker_engine_t); + OBJ_CONSTRUCT(&wkr->mutex, opal_mutex_t); + wkr->worker = wpool->recv_worker; + wkr->endpoints = NULL; + wkr->comm_size = 0; + + _wpool_add_to_idle(wpool, wkr); + + status = ucp_worker_get_address(wpool->recv_worker, + &wpool->recv_waddr, &wpool->recv_waddr_len); + if (status != UCS_OK) { + MCA_COMMON_UCX_VERBOSE(1, "ucp_worker_get_address failed: %d", status); + ret = OPAL_ERROR; + goto err_get_addr; + } + + pthread_key_create(&_tlocal_key, _cleanup_tlocal); + + return ret; + + err_get_addr: + if (NULL != wpool->recv_worker) { + ucp_worker_destroy(wpool->recv_worker); + } + err_worker_create: + ucp_cleanup(wpool->ucp_ctx); + err_ucp_init: + return ret; +} From 817cb28f2fb07c3e1aaacc8540eff17adfce64d0 Mon Sep 17 00:00:00 2001 From: "Artem Y. Polyakov" Date: Mon, 12 Nov 2018 09:13:18 -0800 Subject: [PATCH 04/59] change datastruct --- opal/mca/common/ucx/common_ucx.c | 28 +++++++++++++++++----------- opal/mca/common/ucx/common_ucx.h | 1 + 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index a6b1c98ff13..1363313e4f2 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -24,20 +24,21 @@ typedef struct { ucp_worker_h worker; ucp_ep_h *endpoints; int comm_size; -} _worker_engine_t; +} _worker_info_t; -OBJ_CLASS_DECLARATION(_worker_engine_t); +OBJ_CLASS_DECLARATION(_worker_info_t); typedef struct { int ctx_id; + int is_freed; opal_common_ucx_ctx_t *gctx; - _worker_engine_t *worker; + _worker_info_t *worker; } _tlocal_ctx_t; OBJ_CLASS_DECLARATION(_tlocal_ctx_t); typedef struct { - _worker_engine_t *worker; + _worker_info_t *worker; ucp_rkey_h *rkeys; } _mem_info_t; @@ -53,7 +54,7 @@ OBJ_CLASS_DECLARATION(_tlocal_mem_t); typedef struct { opal_list_item_t super; - _worker_engine_t *ptr; + _worker_info_t *ptr; } _idle_list_item_t; OBJ_CLASS_DECLARATION(_idle_list_item_t); @@ -61,7 +62,7 @@ OBJ_CLASS_INSTANCE(_idle_list_item_t, opal_list_item_t, NULL, NULL); typedef struct { opal_list_item_t super; - _worker_engine_t *ptr; + _worker_info_t *ptr; } _worker_list_item_t; OBJ_CLASS_DECLARATION(_worker_list_item_t); @@ -77,12 +78,17 @@ OBJ_CLASS_INSTANCE(_mem_region_list_item_t, opal_list_item_t, NULL, NULL); /* thread-local table */ typedef struct { + opal_list_item_t super; + opal_common_ucx_wpool_t *wpool; _tlocal_ctx_t **ctx_tbl; size_t ctx_tbl_size; _tlocal_mem_t **mem_tbl; size_t mem_tbl_size; } _tlocal_table_t; +OBJ_CLASS_DECLARATION(_tlocal_table_t); +OBJ_CLASS_INSTANCE(_tlocal_table_t, opal_list_item_t, NULL, NULL); + static pthread_key_t _tlocal_key = {0}; /***********************************************************************/ @@ -312,7 +318,7 @@ static ucp_worker_h _create_ctx_worker(opal_common_ucx_wpool_t *wpool) } static void _wpool_add_to_idle(opal_common_ucx_wpool_t *wpool, - _worker_engine_t *wkr) + _worker_info_t *wkr) { _idle_list_item_t *item; @@ -334,9 +340,9 @@ static void _wpool_add_to_idle(opal_common_ucx_wpool_t *wpool, opal_mutex_unlock(&wpool->mutex); } -static _worker_engine_t* _wpool_remove_from_idle(opal_common_ucx_wpool_t *wpool) +static _worker_info_t* _wpool_remove_from_idle(opal_common_ucx_wpool_t *wpool) { - _worker_engine_t *wkr = NULL; + _worker_info_t *wkr = NULL; _idle_list_item_t *item = NULL; opal_mutex_lock(&wpool->mutex); @@ -362,7 +368,7 @@ OPAL_DECLSPEC int opal_common_ucx_wpool_init(opal_common_ucx_wpool_t *wpool, { ucp_config_t *config = NULL; ucp_params_t context_params; - _worker_engine_t *wkr; + _worker_info_t *wkr; ucs_status_t status; int ret = OPAL_SUCCESS; @@ -405,7 +411,7 @@ OPAL_DECLSPEC int opal_common_ucx_wpool_init(opal_common_ucx_wpool_t *wpool, goto err_worker_create; } - wkr = OBJ_NEW(_worker_engine_t); + wkr = OBJ_NEW(_worker_info_t); OBJ_CONSTRUCT(&wkr->mutex, opal_mutex_t); wkr->worker = wpool->recv_worker; wkr->endpoints = NULL; diff --git a/opal/mca/common/ucx/common_ucx.h b/opal/mca/common/ucx/common_ucx.h index a89cd051126..a9c26218562 100644 --- a/opal/mca/common/ucx/common_ucx.h +++ b/opal/mca/common/ucx/common_ucx.h @@ -104,6 +104,7 @@ typedef struct { ucp_address_t *recv_waddr; size_t recv_waddr_len; int cur_ctxid, cur_memid; + opal_list_t tls_list; } opal_common_ucx_wpool_t; typedef struct { From 366ab6a9b3e36c3a18bf203facb929123b957642 Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Mon, 12 Nov 2018 09:54:08 -0800 Subject: [PATCH 05/59] data structure --- opal/mca/common/ucx/common_ucx.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/opal/mca/common/ucx/common_ucx.h b/opal/mca/common/ucx/common_ucx.h index a9c26218562..935c6aa857f 100644 --- a/opal/mca/common/ucx/common_ucx.h +++ b/opal/mca/common/ucx/common_ucx.h @@ -114,15 +114,20 @@ typedef struct { opal_list_t workers; /* active worker lists */ char *recv_worker_addrs; int *recv_worker_displs; + int comm_size; } opal_common_ucx_ctx_t; typedef struct { int mem_id; opal_mutex_t mutex; opal_common_ucx_ctx_t *ctx; /* which ctx this mem_reg belongs to */ + ucp_mem_h memh; + void *rkey_addr; + size_t rkey_addr_len; opal_list_t mem_regions; /* mem region lists */ char *mem_addrs; int *mem_displs; + int comm_size; } opal_common_ucx_mem_t; typedef enum { @@ -135,6 +140,15 @@ typedef enum { OPAL_COMMON_UCX_SCOPE_WORKER } opal_common_ucx_flush_scope_t; +typedef enum { + OPAL_COMMON_UCX_MEM_ALLOCATE_MAP, + OPAL_COMMON_UCX_MEM_MAP +} opal_common_ucx_mem_type_t; + +typedef int (*opal_common_ucx_exchange_func_t)(void *my_info, size_t my_info_len, + char **recv_info, int **disps, + void *metadata); + OPAL_DECLSPEC void opal_common_ucx_mca_register(void); OPAL_DECLSPEC void opal_common_ucx_mca_deregister(void); OPAL_DECLSPEC void opal_common_ucx_empty_complete_cb(void *request, ucs_status_t status); From 30756383e4ef3f7c54c6c8060d60b6cd78f43018 Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Mon, 12 Nov 2018 09:27:48 -0800 Subject: [PATCH 06/59] init/finalize funcs --- opal/mca/common/ucx/common_ucx.c | 176 +++++++++++++++++++++++++++++++ opal/mca/common/ucx/common_ucx.h | 19 ++++ 2 files changed, 195 insertions(+) diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index 1363313e4f2..b523af6542c 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -360,6 +360,16 @@ static _worker_info_t* _wpool_remove_from_idle(opal_common_ucx_wpool_t *wpool) return wkr; } +OPAL_DECLSPEC opal_common_ucx_wpool_t * opal_common_ucx_wpool_allocate() +{ + opal_common_ucx_wpool_t *ptr = calloc(1, sizeof(opal_common_ucx_wpool_t *)); + return ptr; +} + +OPAL_DECLSPEC void opal_common_ucx_wpool_free(opal_common_ucx_wpool_t *wpool) +{ + free(wpool); +} OPAL_DECLSPEC int opal_common_ucx_wpool_init(opal_common_ucx_wpool_t *wpool, int proc_world_size, @@ -440,3 +450,169 @@ OPAL_DECLSPEC int opal_common_ucx_wpool_init(opal_common_ucx_wpool_t *wpool, err_ucp_init: return ret; } + + +OPAL_DECLSPEC void opal_common_ucx_wpool_finalize(opal_common_ucx_wpool_t *wpool) +{ + /* Go over the list, free idle list items */ + opal_mutex_lock(&wpool->mutex); + if (!opal_list_is_empty(&wpool->idle_workers)) { + _idle_list_item_t *item, *next; + OPAL_LIST_FOREACH_SAFE(item, next, &wpool->idle_workers, _idle_list_item_t) { + _worker_info_t *curr_worker; + opal_list_remove_item(&wpool->idle_workers, &item->super); + curr_worker = item->ptr; + OBJ_DESTRUCT(&curr_worker->mutex); + ucp_worker_destroy(curr_worker->worker); + OBJ_RELEASE(curr_worker); + OBJ_RELEASE(item); + } + } + opal_mutex_unlock(&wpool->mutex); + + OBJ_DESTRUCT(&wpool->idle_workers); + + OBJ_DESTRUCT(&wpool->mutex); + ucp_worker_release_address(wpool->recv_worker, wpool->recv_waddr); + ucp_worker_destroy(wpool->recv_worker); + ucp_cleanup(wpool->ucp_ctx); +} + +OPAL_DECLSPEC int opal_common_ucx_ctx_create(opal_common_ucx_wpool_t *wpool, int comm_size, + opal_common_ucx_exchange_func_t exchange_func, + void *exchange_metadata, + opal_common_ucx_ctx_t **ctx_ptr) +{ + opal_common_ucx_ctx_t *ctx = calloc(1, sizeof(*ctx)); + int ret = OPAL_SUCCESS; + + ctx->ctx_id = OPAL_ATOMIC_ADD_FETCH32(&ctx->ctx_id, 1); + + OBJ_CONSTRUCT(&ctx->mutex, opal_mutex_t); + OBJ_CONSTRUCT(&ctx->workers, opal_list_t); + ctx->wpool = wpool; + ctx->comm_size = comm_size; + + ctx->recv_worker_addrs = NULL; + ctx->recv_worker_displs = NULL; + ret = exchange_func(wpool->recv_waddr, wpool->recv_waddr_len, + &ctx->recv_worker_addrs, + &ctx->recv_worker_displs, exchange_metadata); + if (ret != OPAL_SUCCESS) { + goto error; + } + + (*ctx_ptr) = ctx; + return ret; + + error: + OBJ_DESTRUCT(&ctx->mutex); + OBJ_DESTRUCT(&ctx->workers); + free(ctx); + (*ctx_ptr) = NULL; + return ret; +} + +static int _comm_ucx_mem_map(opal_common_ucx_wpool_t *wpool, + void **base, size_t size, ucp_mem_h *memh_ptr, + opal_common_ucx_mem_type_t mem_type) +{ + ucp_mem_map_params_t mem_params; + ucp_mem_attr_t mem_attrs; + ucs_status_t status; + int ret = OPAL_SUCCESS; + + memset(&mem_params, 0, sizeof(ucp_mem_map_params_t)); + mem_params.field_mask = UCP_MEM_MAP_PARAM_FIELD_ADDRESS | + UCP_MEM_MAP_PARAM_FIELD_LENGTH | + UCP_MEM_MAP_PARAM_FIELD_FLAGS; + mem_params.length = size; + if (mem_type == OPAL_COMMON_UCX_MEM_ALLOCATE_MAP) { + mem_params.address = NULL; + mem_params.flags = UCP_MEM_MAP_ALLOCATE; + } else { + mem_params.address = (*base); + } + + status = ucp_mem_map(wpool->ucp_ctx, &mem_params, memh_ptr); + if (status != UCS_OK) { + MCA_COMMON_UCX_VERBOSE(1, "ucp_mem_map failed: %d", status); + ret = OPAL_ERROR; + goto error; + } + + mem_attrs.field_mask = UCP_MEM_ATTR_FIELD_ADDRESS | UCP_MEM_ATTR_FIELD_LENGTH; + status = ucp_mem_query((*memh_ptr), &mem_attrs); + if (status != UCS_OK) { + MCA_COMMON_UCX_VERBOSE(1, "ucp_mem_query failed: %d", status); + ret = OPAL_ERROR; + goto error; + } + + assert(mem_attrs.length >= size); + if (mem_type != OPAL_COMMON_UCX_MEM_ALLOCATE_MAP) { + assert(mem_attrs.address == (*base)); + } else { + (*base) = mem_attrs.address; + } + + return ret; + error: + ucp_mem_unmap(wpool->ucp_ctx, (*memh_ptr)); + return ret; +} + + +OPAL_DECLSPEC int opal_common_ucx_mem_create(opal_common_ucx_ctx_t *ctx, int comm_size, + void **mem_base, size_t mem_size, + opal_common_ucx_mem_type_t mem_type, + opal_common_ucx_exchange_func_t exchange_func, + void *exchange_metadata, + opal_common_ucx_mem_t **mem_ptr) +{ + opal_common_ucx_mem_t *mem = calloc(1, sizeof(*mem)); + ucs_status_t status; + int ret = OPAL_SUCCESS; + + mem->mem_id = OPAL_ATOMIC_ADD_FETCH32(&mem->mem_id, 1); + OBJ_CONSTRUCT(&mem->mutex, opal_mutex_t); + OBJ_CONSTRUCT(&mem->mem_regions, opal_list_t); + mem->ctx = ctx; + mem->comm_size = comm_size; + mem->mem_addrs = NULL; + mem->mem_displs = NULL; + + ret = _comm_ucx_mem_map(ctx->wpool, mem_base, mem_size, &mem->memh, mem_type); + if (ret != OPAL_SUCCESS) { + MCA_COMMON_UCX_VERBOSE(1, "_comm_ucx_mem_map failed: %d", ret); + goto error_mem_map; + } + + status = ucp_rkey_pack(ctx->wpool->ucp_ctx, mem->memh, + &mem->rkey_addr, &mem->rkey_addr_len); + if (status != UCS_OK) { + MCA_COMMON_UCX_VERBOSE(1, "ucp_rkey_pack failed: %d", status); + ret = OPAL_ERROR; + goto error_rkey_pack; + } + + ret = exchange_func(mem->rkey_addr, mem->rkey_addr_len, + &mem->mem_addrs, &mem->mem_displs, exchange_metadata); + if (ret != OPAL_SUCCESS) { + goto error_exchange; + } + + (*mem_ptr) = mem; + return ret; + + error_exchange: + ucp_rkey_buffer_release(mem->rkey_addr); + error_rkey_pack: + ucp_mem_unmap(ctx->wpool->ucp_ctx, mem->memh); + error_mem_map: + OBJ_DESTRUCT(&mem->mutex); + OBJ_DESTRUCT(&mem->mem_regions); + free(mem); + (*mem_ptr) = NULL; + return ret; +} diff --git a/opal/mca/common/ucx/common_ucx.h b/opal/mca/common/ucx/common_ucx.h index 935c6aa857f..21bd8ddec81 100644 --- a/opal/mca/common/ucx/common_ucx.h +++ b/opal/mca/common/ucx/common_ucx.h @@ -149,6 +149,25 @@ typedef int (*opal_common_ucx_exchange_func_t)(void *my_info, size_t my_info_len char **recv_info, int **disps, void *metadata); +OPAL_DECLSPEC opal_common_ucx_wpool_t * opal_common_ucx_wpool_allocate(); +OPAL_DECLSPEC void opal_common_ucx_wpool_free(opal_common_ucx_wpool_t *wpool); +OPAL_DECLSPEC int opal_common_ucx_wpool_init(opal_common_ucx_wpool_t *wpool, + int proc_world_size, + ucp_request_init_callback_t req_init_ptr, + size_t req_size); +OPAL_DECLSPEC void opal_common_ucx_wpool_finalize(opal_common_ucx_wpool_t *wpool); +OPAL_DECLSPEC int opal_common_ucx_ctx_create(opal_common_ucx_wpool_t *wpool, int comm_size, + opal_common_ucx_exchange_func_t exchange_func, + void *exchange_metadata, + opal_common_ucx_ctx_t **ctx_ptr); +OPAL_DECLSPEC int opal_common_ucx_mem_create(opal_common_ucx_ctx_t *ctx, int comm_size, + void **mem_base, size_t mem_size, + opal_common_ucx_mem_type_t mem_type, + opal_common_ucx_exchange_func_t exchange_func, + void *exchange_metadata, + opal_common_ucx_mem_t **mem_ptr); + + OPAL_DECLSPEC void opal_common_ucx_mca_register(void); OPAL_DECLSPEC void opal_common_ucx_mca_deregister(void); OPAL_DECLSPEC void opal_common_ucx_empty_complete_cb(void *request, ucs_status_t status); From d3926f7af7cebbea258f674f5931f63cd3e0ef0b Mon Sep 17 00:00:00 2001 From: "Artem Y. Polyakov" Date: Mon, 12 Nov 2018 11:14:52 -0800 Subject: [PATCH 07/59] Add Operation function --- opal/mca/common/ucx/common_ucx.c | 265 ++++++++++++++++++++++++++++++- 1 file changed, 264 insertions(+), 1 deletion(-) diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index b523af6542c..3a109a7287f 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -46,6 +46,7 @@ OBJ_CLASS_DECLARATION(_mem_info_t); typedef struct { int mem_id; + int is_freed; opal_common_ucx_mem_t *gmem; _mem_info_t *mem; } _tlocal_mem_t; @@ -451,7 +452,6 @@ OPAL_DECLSPEC int opal_common_ucx_wpool_init(opal_common_ucx_wpool_t *wpool, return ret; } - OPAL_DECLSPEC void opal_common_ucx_wpool_finalize(opal_common_ucx_wpool_t *wpool) { /* Go over the list, free idle list items */ @@ -616,3 +616,266 @@ OPAL_DECLSPEC int opal_common_ucx_mem_create(opal_common_ucx_ctx_t *ctx, int com (*mem_ptr) = NULL; return ret; } + +static int +_tlocal_extend_ctxtbl(_tlocal_table_t *tbl, size_t append) +{ + size_t i; + tbl->ctx_tbl = realloc(tbl->ctx_tbl, newsize * sizeof(*tbl->ctx_tbl)); + for (i = tbl->ctx_tbl_size; i < tbl->ctx_tbl_size + append; i++) { + tbl->ctx_tbl[i] = calloc(1, sizeof(_thr_local_cctx_t)); + if (NULL == tbl->ctx_tbl[i]) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + + } + tbl->ctx_tbl_size += append; + return OPAL_SUCCESS; +} +static int +_tlocal_extend_memtbl(_tlocal_table_t *tbl, size_t append) +{ + size_t i; + tbl->mem_tbl = realloc(tbl->mem_tbl, newsize * sizeof(*tbl->mem_tbl)); + for (i = tbl->mem_tbl_size; i < tbl->mem_tbl_size + append; i++) { + tbl->mem_tbl[i] = calloc(1, sizeof(*tbl->mem_tbl[i])); + if (NULL == tbl->mem_tbl[i]) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + + } + tbl->mem_tbl_size += append; + return OPAL_SUCCESS; +} + +// TODO: don't want to inline this function +static _tlocal_table_t* _common_ucx_init_tls(opal_common_ucx_wpool_t *wpool) +{ + _tlocal_table_t *tls = NULL; + tls = OBJ_NEW(_tlocal_table_t); + memset(tls, 0, sizeof(tls)); + + /* Add this TLS to the global wpool structure for future + * cleanup purposes */ + tls->wpool = wpool; + opal_mutex_lock(&wpool->mutex); + opal_list_append(&wpool->tls_list, &tls->super); + opal_mutex_unlock(&wpool->mutex); + + if( _tlocal_extend_ctxtbl(tls, 4) ){ + // TODO: handle error + } + if(_tlocal_extend_memtbl(tls, 4)) { + // TODO: handle error + } + pthread_set_specific(_tlocal_key, tls); + return tls; +} + +static inline _worker_info_t *_tlocal_search_ctx(_tlocal_table_t *tls, int ctx_id) +{ + int i; + for(i=0; ictx_tbl_size; i++) { + if( tls->ctx_tbl[i] == ctx_id){ + return tls->ctx_tbl[i]->worker; + } + } + return NULL; +} + +// TODO: Don't want to inline this (slow path) +static _worker_info_t *_tlocal_add_ctx(_tlocal_table_t *tls, + opal_common_ucx_ctx_t *ctx) +{ + int i, rc; + + /* Try to find tavailable spot in the table */ + for (i=0; ictx_tbl_size; i++) { + if (tls->ctx_tbl[i]->is_freed) { + /* Cleanup the record */ + _tlocal_cleanup_ctx_record(tls->ctx_tbl[i]); + break; + } + } + + if( tls->ctx_tbl_size >= i ){ + i = tls->ctx_tbl_size; + if( rc = _tlocal_extend_ctxtbl(tls, 4) ){ + //TODO: error out + return NULL; + } + } + tls->ctx_tbl[i]->ctx_id = ctx->ctx_id; + tls->ctx_tbl[i]->gctx = ctx; + tls->ctx_tbl[i]->is_freed = 0; + tls->ctx_tbl[i]->worker = _get_new_worker(tls); + + /* Make sure that we completed all the data structures before + * placing the item to the list + * NOTE: essentially we don't need this as list append is an + * operation protected by mutex + */ + opal_atomic_wmb(); + + _ctx_append_worker(ctx, tls->ctx_tbl[i]); + + return tls->ctx_tbl[i]; +} + +static int _tlocal_ctx_connect(_tlocal_ctx_t *ctx, int target) +{ + ucp_ep_params_t ep_params; + _worker_info_t *winfo = ctx->worker; + opal_common_ucx_ctx_t *gctx = ctx->gctx; + int displ; + + memset(&ep_params, 0, sizeof(ucp_ep_params_t)); + ep_params.field_mask = UCP_EP_PARAM_FIELD_REMOTE_ADDRESS; + + opal_mutex_lock(&winfo->ctx->mutex); + displ = gctx->recv_worker_displs[target]; + ep_params.address = (ucp_address_t *)&(gctx->recv_worker_addrs[disp]); + status = ucp_ep_create(winfo->worker, &ep_params, &winfo->endpoints[target]); + if (status != UCS_OK) { +// TODO: error out here +// OSC_UCX_VERBOSE(1, "ucp_ep_create failed: %d", status); + ret = OPAL_ERROR; + } + return OPAL_SUCCESS; +} + +static inline _worker_info_t * +_tlocal_search_mem(_tlocal_table_t *tls, int mem_id) +{ + int i; + for(i=0; imem_tbl_size; i++) { + if( tls->mem_tbl[i] == mem_id){ + return tls->mem_tbl[i]->mem; + } + } + return NULL; +} + +// TODO: Don't want to inline this (slow path) +static _worker_info_t *_tlocal_add_mem(_tlocal_table_t *tls, + opal_common_ucx_ctx_t *mem) +{ + int i, rc; + + /* Try to find tavailable spot in the table */ + for (i=0; imem_tbl_size; i++) { + if (tls->mem_tbl[i]->is_freed) { + /* Cleanup the record */ + _tlocal_cleanup_mem_record(tls->mem_tbl[i]); + break; + } + } + + if( tls->mem_tbl_size >= i ){ + i = tls->mem_tbl_size; + if( rc = _tlocal_extend_memtbl(tls, 4) ){ + //TODO: error out + return NULL; + } + } + tls->mem_tbl[i]->mem_id = mem->mem_id; + tls->mem_tbl[i]->gmem = mem; + tls->mem_tbl[i]->is_freed = 0; + tls->mem_tbl[i]->mem = _get_new_memory(tls, mem); + + /* Make sure that we completed all the data structures before + * placing the item to the list + * NOTE: essentially we don't need this as list append is an + * operation protected by mutex + */ + opal_atomic_wmb(); + + _mem_append_rkey(mem, tls->mem_tbl[i]); + + return tls->mem_tbl[i]; +} + +static int _tlocal_mem_create_rkey(_tlocal_mem_t *mem_rec, ucp_ep_h ep, int target) +{ + _mem_info_t *minfo = mem_rec->mem; + opal_common_ucx_mem_t *gmem = mem_rec->gmem; + int displ = gmem->mem_displs[target]; + + status = ucp_ep_rkey_unpack(ep, &gmem->mem_addrs[displ], + &minfo->rkeys[target]); + if (status != UCS_OK) { + // TODO: error out here + // OSC_UCX_VERBOSE(1, "ucp_ep_create failed: %d", status); + ret = OPAL_ERROR; + } + return OPAL_SUCCESS; +} + +OPAL_DECLSPEC int opal_common_ucx_mem_op(opal_common_ucx_mem_t *mem, + opal_common_ucx_op_t op, + int target, + void *buffer, size_t len, + uint64_t rem_addr) +{ + _tlocal_table_t *tls = NULL; + _worker_info_t *worker_info; + _mem_info_t *mem_info; + ucp_ep_h ep; + ucp_rkey_h rkey; + + tls = pthread_get_specific(_tlocal_key); + if( OPAL_UNLIKELY(NULL == tls) ) { + tls = _common_ucx_init_tls(mem->ctx->wpool); + } + /* Obtain the worker structure */ + worker_info = _tlocal_search_ctx(tls, mem->ctx->ctx_id); + if (OPAL_UNLIKELY(NULL == worker_info)) { + worker_info = _tlocal_add_ctx(tls, mem->ctx); + } + + /* Obtain the endpoint */ + if (OPAL_UNLIKELY(NULL == worker_info->endpoints[target])) { + if (rc = _tlocal_ctx_connect(worker_info, target)) { + return rc; + } + } + ep = worker_info->endpoints[target]; + + /* Obtain the memory region info */ + mem_info = _tlocal_search_mem(tls, mem->mem_id); + if (OPAL_UNLIKELY(mem_info == NULL)) { + mem_info = _tlocal_add_mem(tls, mem->mem_id); + } + + /* Obtain the rkey */ + if (OPAL_UNLIKELY(NULL == mem_info->rkeys[target])) { + // Create the rkey + if (rc = _tlocal_mem_rkey_create(mem_info, target)) { + return rc; + } + } + rkey = mem_info->rkeys[target]; + + /* Perform the operation */ + opal_mutex_lock(worker_info->mutex); + switch(op){ + case OPAL_COMMON_UCX_GET: + status = ucp_put_nbi(ep, buffer,len, rem_addr, rkey); + if (status != UCS_OK && status != UCS_INPROGRESS) { + // TODO: Fix the output + // OSC_UCX_VERBOSE(1, "ucp_put_nbi failed: %d", status); + return OPAL_ERROR; + } + break; + case OPAL_COMMON_UCX_PUT: + status = ucp_get_nbi(ep, buffer,len, rem_addr, rkey); + if (status != UCS_OK && status != UCS_INPROGRESS) { + // TODO: Fix the output + // OSC_UCX_VERBOSE(1, "ucp_put_nbi failed: %d", status); + return OPAL_ERROR; + } + break; + } + opal_mutex_unlock(worker_info->mutex); +} + From a6d75bc4e381dd342a28e7d76c1057eaea2bc2f4 Mon Sep 17 00:00:00 2001 From: "Artem Y. Polyakov" Date: Mon, 12 Nov 2018 11:39:43 -0800 Subject: [PATCH 08/59] Add flush function --- opal/mca/common/ucx/common_ucx.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index 3a109a7287f..75135d0877a 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -32,7 +32,7 @@ typedef struct { int ctx_id; int is_freed; opal_common_ucx_ctx_t *gctx; - _worker_info_t *worker; + _worker_info_t *winfo; } _tlocal_ctx_t; OBJ_CLASS_DECLARATION(_tlocal_ctx_t); @@ -63,7 +63,7 @@ OBJ_CLASS_INSTANCE(_idle_list_item_t, opal_list_item_t, NULL, NULL); typedef struct { opal_list_item_t super; - _worker_info_t *ptr; + _tlocal_ctx_t *ptr; } _worker_list_item_t; OBJ_CLASS_DECLARATION(_worker_list_item_t); @@ -708,7 +708,7 @@ static _worker_info_t *_tlocal_add_ctx(_tlocal_table_t *tls, tls->ctx_tbl[i]->ctx_id = ctx->ctx_id; tls->ctx_tbl[i]->gctx = ctx; tls->ctx_tbl[i]->is_freed = 0; - tls->ctx_tbl[i]->worker = _get_new_worker(tls); + tls->ctx_tbl[i]->winfo = _get_new_worker(tls, ctx); /* Make sure that we completed all the data structures before * placing the item to the list @@ -725,7 +725,7 @@ static _worker_info_t *_tlocal_add_ctx(_tlocal_table_t *tls, static int _tlocal_ctx_connect(_tlocal_ctx_t *ctx, int target) { ucp_ep_params_t ep_params; - _worker_info_t *winfo = ctx->worker; + _worker_info_t *winfo = ctx->winfo; opal_common_ucx_ctx_t *gctx = ctx->gctx; int displ; @@ -879,3 +879,23 @@ OPAL_DECLSPEC int opal_common_ucx_mem_op(opal_common_ucx_mem_t *mem, opal_mutex_unlock(worker_info->mutex); } +int opal_common_ucx_mem_flush(opal_common_ucx_mem_t *mem, + opal_common_ucx_flush_scope_t scope, + int target) +{ + _worker_list_item_t *item; + opal_mutex_lock(&mem->ctx->mutex); + OPAL_LIST_FOREACH(item, &ctx->workers, _worker_list_item_t) { + switch (scope) { + case OPAL_COMMON_UCX_SCOPE_WORKER: + opal_common_ucx_worker_flush(item->ptr->winfo->worker); + break; + case OPAL_COMMON_UCX_SCOPE_EP: + if (NULL != item->ptr->winfo->endpoints[target] ) { + opal_common_ucx_ep_flush(item->ptr->winfo->endpoints[target], + item->ptr->winfo->worker); + } + } + } + opal_mutex_unlock(&mem->ctx->mutex); +} From 113f2fffec987ba2a0c6885cf4b38f8a424cc45b Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Mon, 12 Nov 2018 11:41:31 -0800 Subject: [PATCH 09/59] modify mem_create --- opal/mca/common/ucx/common_ucx.c | 11 ++++++----- opal/mca/common/ucx/common_ucx.h | 2 -- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index 75135d0877a..ecfee70ca70 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -571,6 +571,8 @@ OPAL_DECLSPEC int opal_common_ucx_mem_create(opal_common_ucx_ctx_t *ctx, int com opal_common_ucx_mem_t **mem_ptr) { opal_common_ucx_mem_t *mem = calloc(1, sizeof(*mem)); + void *rkey_addr = NULL; + size_t rkey_addr_len; ucs_status_t status; int ret = OPAL_SUCCESS; @@ -589,24 +591,23 @@ OPAL_DECLSPEC int opal_common_ucx_mem_create(opal_common_ucx_ctx_t *ctx, int com } status = ucp_rkey_pack(ctx->wpool->ucp_ctx, mem->memh, - &mem->rkey_addr, &mem->rkey_addr_len); + &rkey_addr, &rkey_addr_len); if (status != UCS_OK) { MCA_COMMON_UCX_VERBOSE(1, "ucp_rkey_pack failed: %d", status); ret = OPAL_ERROR; goto error_rkey_pack; } - ret = exchange_func(mem->rkey_addr, mem->rkey_addr_len, + ret = exchange_func(rkey_addr, rkey_addr_len, &mem->mem_addrs, &mem->mem_displs, exchange_metadata); + ucp_rkey_buffer_release(rkey_addr); if (ret != OPAL_SUCCESS) { - goto error_exchange; + goto error_rkey_pack; } (*mem_ptr) = mem; return ret; - error_exchange: - ucp_rkey_buffer_release(mem->rkey_addr); error_rkey_pack: ucp_mem_unmap(ctx->wpool->ucp_ctx, mem->memh); error_mem_map: diff --git a/opal/mca/common/ucx/common_ucx.h b/opal/mca/common/ucx/common_ucx.h index 21bd8ddec81..35bf9def9c5 100644 --- a/opal/mca/common/ucx/common_ucx.h +++ b/opal/mca/common/ucx/common_ucx.h @@ -122,8 +122,6 @@ typedef struct { opal_mutex_t mutex; opal_common_ucx_ctx_t *ctx; /* which ctx this mem_reg belongs to */ ucp_mem_h memh; - void *rkey_addr; - size_t rkey_addr_len; opal_list_t mem_regions; /* mem region lists */ char *mem_addrs; int *mem_displs; From 2f1ac683ce5db2f9184153fe7ca0ff9472e05bcc Mon Sep 17 00:00:00 2001 From: "Artem Y. Polyakov" Date: Mon, 12 Nov 2018 12:32:18 -0800 Subject: [PATCH 10/59] Intermediate fixes --- opal/mca/common/ucx/common_ucx.c | 101 ++++++++++++++++++++++++++----- opal/mca/common/ucx/common_ucx.h | 6 +- 2 files changed, 90 insertions(+), 17 deletions(-) diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index 75135d0877a..2de3b80c8ec 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -16,6 +16,7 @@ #include "opal/memoryhooks/memory.h" #include +#include /***********************************************************************/ @@ -318,7 +319,7 @@ static ucp_worker_h _create_ctx_worker(opal_common_ucx_wpool_t *wpool) return worker; } -static void _wpool_add_to_idle(opal_common_ucx_wpool_t *wpool, +static int _wpool_add_to_idle(opal_common_ucx_wpool_t *wpool, _worker_info_t *wkr) { _idle_list_item_t *item; @@ -334,11 +335,15 @@ static void _wpool_add_to_idle(opal_common_ucx_wpool_t *wpool, } item = OBJ_NEW(_idle_list_item_t); + if (NULL == item) { + return OPAL_ERR_OUT_OF_RESOURCE; + } item->ptr = wkr; opal_mutex_lock(&wpool->mutex); opal_list_append(&wpool->idle_workers, &item->super); opal_mutex_unlock(&wpool->mutex); + return OPAL_SUCCESS; } static _worker_info_t* _wpool_remove_from_idle(opal_common_ucx_wpool_t *wpool) @@ -361,7 +366,7 @@ static _worker_info_t* _wpool_remove_from_idle(opal_common_ucx_wpool_t *wpool) return wkr; } -OPAL_DECLSPEC opal_common_ucx_wpool_t * opal_common_ucx_wpool_allocate() +OPAL_DECLSPEC opal_common_ucx_wpool_t * opal_common_ucx_wpool_allocate(void) { opal_common_ucx_wpool_t *ptr = calloc(1, sizeof(opal_common_ucx_wpool_t *)); return ptr; @@ -513,6 +518,48 @@ OPAL_DECLSPEC int opal_common_ucx_ctx_create(opal_common_ucx_wpool_t *wpool, int return ret; } +static int _common_ucx_ctx_free(opal_common_ucx_ctx_t *ctx) +{ + free(ctx->recv_worker_addrs); + free(ctx->recv_worker_displs); + OBJ_DESTRUCT(&ctx->mutex); + OBJ_DESTRUCT(&ctx->workers); + free(ctx); +} + +OPAL_DECLSPEC int opal_common_ucx_ctx_release(opal_common_ucx_ctx_t *ctx) +{ + // TODO: implement + _tlocal_ctx_release(ctx); +} + +static void +_common_ucx_ctx_remove(opal_common_ucx_ctx_t *ctx, _worker_info_t *ctx_rec) +{ + int can_free = 0; + _worker_list_item_t *item = NULL, next; + + opal_mutex_lock(&ctx->mutex); + OPAL_LIST_FOREACH_SAFE(item, next, &ctx->workers, _worker_list_item_t) { + if (ctx_rec == item->ptr) { + opal_list_remove_item(&ctx->workers, &item->super); + OBJ_RELEASE(item); + break; + } + } + if (0 == opal_list_get_size()) { + can_free = 1; + } + opal_mutex_unlock(&ctx->mutex); + + if (can_free) { + /* All references to this data structure are removed + * we can safely release communication context structure + */ + _common_ucx_ctx_free(ctx); + } +} + static int _comm_ucx_mem_map(opal_common_ucx_wpool_t *wpool, void **base, size_t size, ucp_mem_h *memh_ptr, opal_common_ucx_mem_type_t mem_type) @@ -621,30 +668,32 @@ static int _tlocal_extend_ctxtbl(_tlocal_table_t *tbl, size_t append) { size_t i; + size_t newsize = (tbl->ctx_tbl_size + append); tbl->ctx_tbl = realloc(tbl->ctx_tbl, newsize * sizeof(*tbl->ctx_tbl)); - for (i = tbl->ctx_tbl_size; i < tbl->ctx_tbl_size + append; i++) { - tbl->ctx_tbl[i] = calloc(1, sizeof(_thr_local_cctx_t)); + for (i = tbl->ctx_tbl_size; i < newsize; i++) { + tbl->ctx_tbl[i] = calloc(1, sizeof(*tbl->ctx_tbl[i])); if (NULL == tbl->ctx_tbl[i]) { return OPAL_ERR_OUT_OF_RESOURCE; } } - tbl->ctx_tbl_size += append; + tbl->ctx_tbl_size = newsize; return OPAL_SUCCESS; } static int _tlocal_extend_memtbl(_tlocal_table_t *tbl, size_t append) { size_t i; + size_t newsize = (tbl->ctx_tbl_size + append); + tbl->mem_tbl = realloc(tbl->mem_tbl, newsize * sizeof(*tbl->mem_tbl)); for (i = tbl->mem_tbl_size; i < tbl->mem_tbl_size + append; i++) { tbl->mem_tbl[i] = calloc(1, sizeof(*tbl->mem_tbl[i])); if (NULL == tbl->mem_tbl[i]) { return OPAL_ERR_OUT_OF_RESOURCE; } - } - tbl->mem_tbl_size += append; + tbl->mem_tbl_size = newsize; return OPAL_SUCCESS; } @@ -653,7 +702,7 @@ static _tlocal_table_t* _common_ucx_init_tls(opal_common_ucx_wpool_t *wpool) { _tlocal_table_t *tls = NULL; tls = OBJ_NEW(_tlocal_table_t); - memset(tls, 0, sizeof(tls)); + memset(tls, 0, sizeof(*tls)); /* Add this TLS to the global wpool structure for future * cleanup purposes */ @@ -668,26 +717,51 @@ static _tlocal_table_t* _common_ucx_init_tls(opal_common_ucx_wpool_t *wpool) if(_tlocal_extend_memtbl(tls, 4)) { // TODO: handle error } - pthread_set_specific(_tlocal_key, tls); + pthread_setspecific(_tlocal_key, tls); return tls; } static inline _worker_info_t *_tlocal_search_ctx(_tlocal_table_t *tls, int ctx_id) { - int i; + size_t i; for(i=0; ictx_tbl_size; i++) { - if( tls->ctx_tbl[i] == ctx_id){ - return tls->ctx_tbl[i]->worker; + if( tls->ctx_tbl[i]->ctx_id == ctx_id){ + return tls->ctx_tbl[i]->winfo; } } return NULL; } +static int _tlocal_cleanup_ctx_record(_tlocal_ctx_t *ctx_rec) +{ + int rc; + if (!ctx_rec->is_freed) { + return OPAL_SUCCESS; + } + /* Remove myself from the communication context structure + * This may result in context release as we are using + * delayed cleanup */ + rc = _common_ucx_ctx_remove(ctx_rec->gctx, ctx_rec); + if (rc) { + return rc; + } + + /* Return the worker back to the + * This may result in context release as we are using + * delayed cleanup */ + rc = _wpool_add_to_idle(ctx_rec->gctx->wpool, ctx_rec->winfo); + if (rc) { + return rc; + } + memset(ctx_rec, 0, sizeof(*ctx_rec)); +} + // TODO: Don't want to inline this (slow path) static _worker_info_t *_tlocal_add_ctx(_tlocal_table_t *tls, opal_common_ucx_ctx_t *ctx) { - int i, rc; + size_t i; + int rc; /* Try to find tavailable spot in the table */ for (i=0; ictx_tbl_size; i++) { @@ -707,7 +781,6 @@ static _worker_info_t *_tlocal_add_ctx(_tlocal_table_t *tls, } tls->ctx_tbl[i]->ctx_id = ctx->ctx_id; tls->ctx_tbl[i]->gctx = ctx; - tls->ctx_tbl[i]->is_freed = 0; tls->ctx_tbl[i]->winfo = _get_new_worker(tls, ctx); /* Make sure that we completed all the data structures before diff --git a/opal/mca/common/ucx/common_ucx.h b/opal/mca/common/ucx/common_ucx.h index 21bd8ddec81..7c393d5a3a9 100644 --- a/opal/mca/common/ucx/common_ucx.h +++ b/opal/mca/common/ucx/common_ucx.h @@ -108,7 +108,7 @@ typedef struct { } opal_common_ucx_wpool_t; typedef struct { - int ctx_id; + opal_atomic_int32_t ctx_id; opal_mutex_t mutex; opal_common_ucx_wpool_t *wpool; /* which wpool this ctx belongs to */ opal_list_t workers; /* active worker lists */ @@ -118,7 +118,7 @@ typedef struct { } opal_common_ucx_ctx_t; typedef struct { - int mem_id; + opal_atomic_int32_t mem_id; opal_mutex_t mutex; opal_common_ucx_ctx_t *ctx; /* which ctx this mem_reg belongs to */ ucp_mem_h memh; @@ -149,7 +149,7 @@ typedef int (*opal_common_ucx_exchange_func_t)(void *my_info, size_t my_info_len char **recv_info, int **disps, void *metadata); -OPAL_DECLSPEC opal_common_ucx_wpool_t * opal_common_ucx_wpool_allocate(); +OPAL_DECLSPEC opal_common_ucx_wpool_t * opal_common_ucx_wpool_allocate(void); OPAL_DECLSPEC void opal_common_ucx_wpool_free(opal_common_ucx_wpool_t *wpool); OPAL_DECLSPEC int opal_common_ucx_wpool_init(opal_common_ucx_wpool_t *wpool, int proc_world_size, From 3838e721543afd52dc308274c66063e8b07d6ff9 Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Mon, 12 Nov 2018 13:39:00 -0800 Subject: [PATCH 11/59] modify on func signature --- opal/mca/common/ucx/common_ucx.c | 4 ++-- opal/mca/common/ucx/common_ucx.h | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index 0c5e06e1f02..4407d5ac4f7 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -380,7 +380,7 @@ OPAL_DECLSPEC void opal_common_ucx_wpool_free(opal_common_ucx_wpool_t *wpool) OPAL_DECLSPEC int opal_common_ucx_wpool_init(opal_common_ucx_wpool_t *wpool, int proc_world_size, ucp_request_init_callback_t req_init_ptr, - size_t req_size) + size_t req_size, bool enable_mt) { ucp_config_t *config = NULL; ucp_params_t context_params; @@ -405,7 +405,7 @@ OPAL_DECLSPEC int opal_common_ucx_wpool_init(opal_common_ucx_wpool_t *wpool, UCP_PARAM_FIELD_REQUEST_INIT | UCP_PARAM_FIELD_REQUEST_SIZE; context_params.features = UCP_FEATURE_RMA | UCP_FEATURE_AMO32 | UCP_FEATURE_AMO64; - context_params.mt_workers_shared = 1; + context_params.mt_workers_shared = (enable_mt ? 1 : 0); context_params.estimated_num_eps = proc_world_size; context_params.request_init = req_init_ptr; context_params.request_size = req_size; diff --git a/opal/mca/common/ucx/common_ucx.h b/opal/mca/common/ucx/common_ucx.h index 28ead2dfd0f..e7b9bf0299f 100644 --- a/opal/mca/common/ucx/common_ucx.h +++ b/opal/mca/common/ucx/common_ucx.h @@ -97,6 +97,7 @@ typedef struct opal_common_ucx_del_proc { extern opal_common_ucx_module_t opal_common_ucx; typedef struct { + int refcnt; ucp_context_h ucp_ctx; opal_mutex_t mutex; opal_list_t idle_workers; @@ -152,7 +153,7 @@ OPAL_DECLSPEC void opal_common_ucx_wpool_free(opal_common_ucx_wpool_t *wpool); OPAL_DECLSPEC int opal_common_ucx_wpool_init(opal_common_ucx_wpool_t *wpool, int proc_world_size, ucp_request_init_callback_t req_init_ptr, - size_t req_size); + size_t req_size, bool enable_mt); OPAL_DECLSPEC void opal_common_ucx_wpool_finalize(opal_common_ucx_wpool_t *wpool); OPAL_DECLSPEC int opal_common_ucx_ctx_create(opal_common_ucx_wpool_t *wpool, int comm_size, opal_common_ucx_exchange_func_t exchange_func, From 78b31f561c97c1f7225185381e7573cf80e0dba8 Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Mon, 12 Nov 2018 13:42:31 -0800 Subject: [PATCH 12/59] modify on wpool_init --- opal/mca/common/ucx/common_ucx.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index 4407d5ac4f7..933163387cd 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -369,11 +369,13 @@ static _worker_info_t* _wpool_remove_from_idle(opal_common_ucx_wpool_t *wpool) OPAL_DECLSPEC opal_common_ucx_wpool_t * opal_common_ucx_wpool_allocate(void) { opal_common_ucx_wpool_t *ptr = calloc(1, sizeof(opal_common_ucx_wpool_t *)); + ptr->refcnt = 0; return ptr; } OPAL_DECLSPEC void opal_common_ucx_wpool_free(opal_common_ucx_wpool_t *wpool) { + assert(wpool->refcnt == 0); free(wpool); } @@ -388,6 +390,12 @@ OPAL_DECLSPEC int opal_common_ucx_wpool_init(opal_common_ucx_wpool_t *wpool, ucs_status_t status; int ret = OPAL_SUCCESS; + if (wpool->refcnt > 0) { + wpool->refcnt++; + return ret; + } + + wpool->refcnt++; wpool->cur_ctxid = wpool->cur_memid = 0; OBJ_CONSTRUCT(&wpool->mutex, opal_mutex_t); @@ -459,6 +467,11 @@ OPAL_DECLSPEC int opal_common_ucx_wpool_init(opal_common_ucx_wpool_t *wpool, OPAL_DECLSPEC void opal_common_ucx_wpool_finalize(opal_common_ucx_wpool_t *wpool) { + wpool->refcnt--; + if (wpool->refcnt > 0) { + return; + } + /* Go over the list, free idle list items */ opal_mutex_lock(&wpool->mutex); if (!opal_list_is_empty(&wpool->idle_workers)) { From 8eeb3248623445bc0bdfbb697a2da3f385693a27 Mon Sep 17 00:00:00 2001 From: "Artem Y. Polyakov" Date: Mon, 12 Nov 2018 16:23:48 -0800 Subject: [PATCH 13/59] Bring everything to a "builds OK" state --- opal/mca/common/ucx/common_ucx.c | 462 +++++++++++++++++++++++-------- opal/mca/common/ucx/common_ucx.h | 7 +- 2 files changed, 355 insertions(+), 114 deletions(-) diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index 933163387cd..d6bdae02f90 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -24,7 +24,7 @@ typedef struct { opal_mutex_t mutex; ucp_worker_h worker; ucp_ep_h *endpoints; - int comm_size; + size_t comm_size; } _worker_info_t; OBJ_CLASS_DECLARATION(_worker_info_t); @@ -72,7 +72,7 @@ OBJ_CLASS_INSTANCE(_worker_list_item_t, opal_list_item_t, NULL, NULL); typedef struct { opal_list_item_t super; - _mem_info_t *ptr; + _tlocal_mem_t *ptr; } _mem_region_list_item_t; OBJ_CLASS_DECLARATION(_mem_region_list_item_t); @@ -93,6 +93,33 @@ OBJ_CLASS_INSTANCE(_tlocal_table_t, opal_list_item_t, NULL, NULL); static pthread_key_t _tlocal_key = {0}; +OPAL_DECLSPEC int +opal_common_ucx_mem_op(opal_common_ucx_mem_t *mem, + opal_common_ucx_op_t op, + int target, + void *buffer, size_t len, + uint64_t rem_addr); +OPAL_DECLSPEC void opal_common_ucx_mem_flush(opal_common_ucx_mem_t *mem, + opal_common_ucx_flush_scope_t scope, + int target); + +static int _tlocal_tls_ctxtbl_extend(_tlocal_table_t *tbl, size_t append); +static int _tlocal_tls_memtbl_extend(_tlocal_table_t *tbl, size_t append); +static _tlocal_table_t* _common_ucx_tls_init(opal_common_ucx_wpool_t *wpool); +static inline _tlocal_ctx_t *_tlocal_ctx_search(_tlocal_table_t *tls, + int ctx_id); +static int _tlocal_cleanup_ctx_record(_tlocal_ctx_t *ctx_rec); +static _tlocal_ctx_t *_tlocal_add_ctx(_tlocal_table_t *tls, + opal_common_ucx_ctx_t *ctx); +static int _tlocal_ctx_connect(_tlocal_ctx_t *ctx, int target); +static int _tlocal_ctx_release(opal_common_ucx_ctx_t *ctx); +static inline _tlocal_mem_t *_tlocal_search_mem(_tlocal_table_t *tls, + int mem_id); +static _tlocal_mem_t *_tlocal_add_mem(_tlocal_table_t *tls, + opal_common_ucx_mem_t *mem); +static int _tlocal_mem_create_rkey(_tlocal_mem_t *mem_rec, ucp_ep_h ep, + int target); + /***********************************************************************/ extern mca_base_framework_t opal_memory_base_framework; @@ -320,25 +347,25 @@ static ucp_worker_h _create_ctx_worker(opal_common_ucx_wpool_t *wpool) } static int _wpool_add_to_idle(opal_common_ucx_wpool_t *wpool, - _worker_info_t *wkr) + _worker_info_t *winfo) { _idle_list_item_t *item; - if(wkr->comm_size != 0) { - int i; - for (i = 0; i < wkr->comm_size; i++) { - ucp_ep_destroy(wkr->endpoints[i]); + if(winfo->comm_size != 0) { + size_t i; + for (i = 0; i < winfo->comm_size; i++) { + ucp_ep_destroy(winfo->endpoints[i]); } - free(wkr->endpoints); - wkr->endpoints = NULL; - wkr->comm_size = 0; + free(winfo->endpoints); + winfo->endpoints = NULL; + winfo->comm_size = 0; } item = OBJ_NEW(_idle_list_item_t); if (NULL == item) { return OPAL_ERR_OUT_OF_RESOURCE; } - item->ptr = wkr; + item->ptr = winfo; opal_mutex_lock(&wpool->mutex); opal_list_append(&wpool->idle_workers, &item->super); @@ -388,11 +415,11 @@ OPAL_DECLSPEC int opal_common_ucx_wpool_init(opal_common_ucx_wpool_t *wpool, ucp_params_t context_params; _worker_info_t *wkr; ucs_status_t status; - int ret = OPAL_SUCCESS; + int rc = OPAL_SUCCESS; if (wpool->refcnt > 0) { wpool->refcnt++; - return ret; + return rc; } wpool->refcnt++; @@ -422,7 +449,7 @@ OPAL_DECLSPEC int opal_common_ucx_wpool_init(opal_common_ucx_wpool_t *wpool, ucp_config_release(config); if (UCS_OK != status) { MCA_COMMON_UCX_VERBOSE(1, "ucp_init failed: %d", status); - ret = OPAL_ERROR; + rc = OPAL_ERROR; goto err_ucp_init; } @@ -431,7 +458,7 @@ OPAL_DECLSPEC int opal_common_ucx_wpool_init(opal_common_ucx_wpool_t *wpool, wpool->recv_worker = _create_ctx_worker(wpool); if (wpool->recv_worker == NULL) { MCA_COMMON_UCX_VERBOSE(1, "_create_ctx_worker failed"); - ret = OPAL_ERROR; + rc = OPAL_ERROR; goto err_worker_create; } @@ -441,28 +468,33 @@ OPAL_DECLSPEC int opal_common_ucx_wpool_init(opal_common_ucx_wpool_t *wpool, wkr->endpoints = NULL; wkr->comm_size = 0; - _wpool_add_to_idle(wpool, wkr); - status = ucp_worker_get_address(wpool->recv_worker, &wpool->recv_waddr, &wpool->recv_waddr_len); if (status != UCS_OK) { MCA_COMMON_UCX_VERBOSE(1, "ucp_worker_get_address failed: %d", status); - ret = OPAL_ERROR; + rc = OPAL_ERROR; goto err_get_addr; } + rc = _wpool_add_to_idle(wpool, wkr); + if (rc) { + goto err_wpool_add; + } + pthread_key_create(&_tlocal_key, _cleanup_tlocal); - return ret; + return rc; - err_get_addr: +err_wpool_add: + free(wpool->recv_waddr); +err_get_addr: if (NULL != wpool->recv_worker) { ucp_worker_destroy(wpool->recv_worker); } err_worker_create: ucp_cleanup(wpool->ucp_ctx); err_ucp_init: - return ret; + return rc; } OPAL_DECLSPEC void opal_common_ucx_wpool_finalize(opal_common_ucx_wpool_t *wpool) @@ -531,7 +563,7 @@ OPAL_DECLSPEC int opal_common_ucx_ctx_create(opal_common_ucx_wpool_t *wpool, int return ret; } -static int _common_ucx_ctx_free(opal_common_ucx_ctx_t *ctx) +static void _common_ucx_ctx_free(opal_common_ucx_ctx_t *ctx) { free(ctx->recv_worker_addrs); free(ctx->recv_worker_displs); @@ -540,17 +572,33 @@ static int _common_ucx_ctx_free(opal_common_ucx_ctx_t *ctx) free(ctx); } -OPAL_DECLSPEC int opal_common_ucx_ctx_release(opal_common_ucx_ctx_t *ctx) +OPAL_DECLSPEC void +opal_common_ucx_ctx_release(opal_common_ucx_ctx_t *ctx) { // TODO: implement _tlocal_ctx_release(ctx); } +static int +_common_ucx_ctx_append(opal_common_ucx_ctx_t *ctx, + _tlocal_ctx_t *ctx_rec) +{ + _worker_list_item_t *item = OBJ_NEW(_worker_list_item_t); + if (NULL == item) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + item->ptr = ctx_rec; + opal_mutex_lock(&ctx->mutex); + opal_list_append(&ctx->workers, &item->super); + opal_mutex_unlock(&ctx->mutex); + return OPAL_SUCCESS; +} + static void -_common_ucx_ctx_remove(opal_common_ucx_ctx_t *ctx, _worker_info_t *ctx_rec) +_common_ucx_ctx_remove(opal_common_ucx_ctx_t *ctx, _tlocal_ctx_t *ctx_rec) { int can_free = 0; - _worker_list_item_t *item = NULL, next; + _worker_list_item_t *item = NULL, *next; opal_mutex_lock(&ctx->mutex); OPAL_LIST_FOREACH_SAFE(item, next, &ctx->workers, _worker_list_item_t) { @@ -560,15 +608,14 @@ _common_ucx_ctx_remove(opal_common_ucx_ctx_t *ctx, _worker_info_t *ctx_rec) break; } } - if (0 == opal_list_get_size()) { + if (0 == opal_list_get_size(&ctx->workers)) { can_free = 1; } opal_mutex_unlock(&ctx->mutex); if (can_free) { /* All references to this data structure are removed - * we can safely release communication context structure - */ + * we can safely release communication context structure */ _common_ucx_ctx_free(ctx); } } @@ -638,9 +685,8 @@ OPAL_DECLSPEC int opal_common_ucx_mem_create(opal_common_ucx_ctx_t *ctx, int com mem->mem_id = OPAL_ATOMIC_ADD_FETCH32(&mem->mem_id, 1); OBJ_CONSTRUCT(&mem->mutex, opal_mutex_t); - OBJ_CONSTRUCT(&mem->mem_regions, opal_list_t); + OBJ_CONSTRUCT(&mem->registrations, opal_list_t); mem->ctx = ctx; - mem->comm_size = comm_size; mem->mem_addrs = NULL; mem->mem_displs = NULL; @@ -672,47 +718,66 @@ OPAL_DECLSPEC int opal_common_ucx_mem_create(opal_common_ucx_ctx_t *ctx, int com ucp_mem_unmap(ctx->wpool->ucp_ctx, mem->memh); error_mem_map: OBJ_DESTRUCT(&mem->mutex); - OBJ_DESTRUCT(&mem->mem_regions); + OBJ_DESTRUCT(&mem->registrations); free(mem); (*mem_ptr) = NULL; return ret; } -static int -_tlocal_extend_ctxtbl(_tlocal_table_t *tbl, size_t append) +static void _common_ucx_mem_free(opal_common_ucx_mem_t *mem) { - size_t i; - size_t newsize = (tbl->ctx_tbl_size + append); - tbl->ctx_tbl = realloc(tbl->ctx_tbl, newsize * sizeof(*tbl->ctx_tbl)); - for (i = tbl->ctx_tbl_size; i < newsize; i++) { - tbl->ctx_tbl[i] = calloc(1, sizeof(*tbl->ctx_tbl[i])); - if (NULL == tbl->ctx_tbl[i]) { - return OPAL_ERR_OUT_OF_RESOURCE; - } + free(mem->mem_addrs); + free(mem->mem_displs); + ucp_mem_unmap(mem->ctx->wpool->ucp_ctx, mem->memh); + OBJ_DESTRUCT(&mem->mutex); + OBJ_DESTRUCT(&mem->registrations); + free(mem); +} +static int +_common_ucx_mem_append(opal_common_ucx_mem_t *mem, + _tlocal_mem_t *mem_rec) +{ + _mem_region_list_item_t *item = OBJ_NEW(_mem_region_list_item_t); + if (NULL == item) { + return OPAL_ERR_OUT_OF_RESOURCE; } - tbl->ctx_tbl_size = newsize; + item->ptr = mem_rec; + opal_mutex_lock(&mem->mutex); + opal_list_append(&mem->registrations, &item->super); + opal_mutex_unlock(&mem->mutex); return OPAL_SUCCESS; } -static int -_tlocal_extend_memtbl(_tlocal_table_t *tbl, size_t append) + +static void +_common_ucx_mem_remove(opal_common_ucx_mem_t *mem, _tlocal_mem_t *mem_rec) { - size_t i; - size_t newsize = (tbl->ctx_tbl_size + append); + int can_free = 0; + _mem_region_list_item_t *item = NULL, *next; - tbl->mem_tbl = realloc(tbl->mem_tbl, newsize * sizeof(*tbl->mem_tbl)); - for (i = tbl->mem_tbl_size; i < tbl->mem_tbl_size + append; i++) { - tbl->mem_tbl[i] = calloc(1, sizeof(*tbl->mem_tbl[i])); - if (NULL == tbl->mem_tbl[i]) { - return OPAL_ERR_OUT_OF_RESOURCE; + opal_mutex_lock(&mem->mutex); + OPAL_LIST_FOREACH_SAFE(item, next, &mem->registrations, _mem_region_list_item_t) { + if (mem_rec == item->ptr) { + opal_list_remove_item(&mem->registrations, &item->super); + OBJ_RELEASE(item); + break; } } - tbl->mem_tbl_size = newsize; - return OPAL_SUCCESS; + if (0 == opal_list_get_size(&mem->registrations)) { + can_free = 1; + } + opal_mutex_unlock(&mem->mutex); + + if (can_free) { + /* All references to this data structure are removed + * we can safely release communication context structure */ + _common_ucx_mem_free(mem); + } } + // TODO: don't want to inline this function -static _tlocal_table_t* _common_ucx_init_tls(opal_common_ucx_wpool_t *wpool) +static _tlocal_table_t* _common_ucx_tls_init(opal_common_ucx_wpool_t *wpool) { _tlocal_table_t *tls = NULL; tls = OBJ_NEW(_tlocal_table_t); @@ -725,28 +790,95 @@ static _tlocal_table_t* _common_ucx_init_tls(opal_common_ucx_wpool_t *wpool) opal_list_append(&wpool->tls_list, &tls->super); opal_mutex_unlock(&wpool->mutex); - if( _tlocal_extend_ctxtbl(tls, 4) ){ + if( _tlocal_tls_ctxtbl_extend(tls, 4) ){ // TODO: handle error } - if(_tlocal_extend_memtbl(tls, 4)) { + if(_tlocal_tls_memtbl_extend(tls, 4)) { // TODO: handle error } pthread_setspecific(_tlocal_key, tls); return tls; } -static inline _worker_info_t *_tlocal_search_ctx(_tlocal_table_t *tls, int ctx_id) +static inline _tlocal_table_t * +_tlocal_get_tls(opal_common_ucx_wpool_t *wpool){ + _tlocal_table_t *tls = pthread_getspecific(_tlocal_key); + if( OPAL_UNLIKELY(NULL == tls) ) { + tls = _common_ucx_tls_init(wpool); + } + return tls; +} + +static int +_tlocal_tls_get_worker(_tlocal_table_t *tls, _worker_info_t **_winfo) +{ + _worker_info_t *winfo; + *_winfo = NULL; + winfo = _wpool_remove_from_idle(tls->wpool); + if (!winfo) { + winfo = calloc(1, sizeof(*winfo)); + if (!winfo) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + OBJ_CONSTRUCT(&winfo->mutex, opal_mutex_t); + _create_ctx_worker(tls->wpool); + winfo->endpoints = NULL; + winfo->comm_size = 0; + opal_mutex_unlock(&tls->wpool->mutex); + } + *_winfo = winfo; + + return OPAL_SUCCESS; +} + +static int +_tlocal_tls_ctxtbl_extend(_tlocal_table_t *tbl, size_t append) +{ + size_t i; + size_t newsize = (tbl->ctx_tbl_size + append); + tbl->ctx_tbl = realloc(tbl->ctx_tbl, newsize * sizeof(*tbl->ctx_tbl)); + for (i = tbl->ctx_tbl_size; i < newsize; i++) { + tbl->ctx_tbl[i] = calloc(1, sizeof(*tbl->ctx_tbl[i])); + if (NULL == tbl->ctx_tbl[i]) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + + } + tbl->ctx_tbl_size = newsize; + return OPAL_SUCCESS; +} +static int +_tlocal_tls_memtbl_extend(_tlocal_table_t *tbl, size_t append) +{ + size_t i; + size_t newsize = (tbl->ctx_tbl_size + append); + + tbl->mem_tbl = realloc(tbl->mem_tbl, newsize * sizeof(*tbl->mem_tbl)); + for (i = tbl->mem_tbl_size; i < tbl->mem_tbl_size + append; i++) { + tbl->mem_tbl[i] = calloc(1, sizeof(*tbl->mem_tbl[i])); + if (NULL == tbl->mem_tbl[i]) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + } + tbl->mem_tbl_size = newsize; + return OPAL_SUCCESS; +} + + +static inline _tlocal_ctx_t * +_tlocal_ctx_search(_tlocal_table_t *tls, int ctx_id) { size_t i; for(i=0; ictx_tbl_size; i++) { if( tls->ctx_tbl[i]->ctx_id == ctx_id){ - return tls->ctx_tbl[i]->winfo; + return tls->ctx_tbl[i]; } } return NULL; } -static int _tlocal_cleanup_ctx_record(_tlocal_ctx_t *ctx_rec) +static int +_tlocal_cleanup_ctx_record(_tlocal_ctx_t *ctx_rec) { int rc; if (!ctx_rec->is_freed) { @@ -755,10 +887,7 @@ static int _tlocal_cleanup_ctx_record(_tlocal_ctx_t *ctx_rec) /* Remove myself from the communication context structure * This may result in context release as we are using * delayed cleanup */ - rc = _common_ucx_ctx_remove(ctx_rec->gctx, ctx_rec); - if (rc) { - return rc; - } + _common_ucx_ctx_remove(ctx_rec->gctx, ctx_rec); /* Return the worker back to the * This may result in context release as we are using @@ -768,19 +897,24 @@ static int _tlocal_cleanup_ctx_record(_tlocal_ctx_t *ctx_rec) return rc; } memset(ctx_rec, 0, sizeof(*ctx_rec)); + return OPAL_SUCCESS; } // TODO: Don't want to inline this (slow path) -static _worker_info_t *_tlocal_add_ctx(_tlocal_table_t *tls, - opal_common_ucx_ctx_t *ctx) +static _tlocal_ctx_t * +_tlocal_add_ctx(_tlocal_table_t *tls, opal_common_ucx_ctx_t *ctx) { size_t i; int rc; /* Try to find tavailable spot in the table */ for (i=0; ictx_tbl_size; i++) { - if (tls->ctx_tbl[i]->is_freed) { - /* Cleanup the record */ + if (0 == tls->ctx_tbl[i]->ctx_id) { + /* Found clean record */ + break; + } + if (tls->ctx_tbl[i]->is_freed ) { + /* Found dirty record, need to clean first */ _tlocal_cleanup_ctx_record(tls->ctx_tbl[i]); break; } @@ -788,14 +922,22 @@ static _worker_info_t *_tlocal_add_ctx(_tlocal_table_t *tls, if( tls->ctx_tbl_size >= i ){ i = tls->ctx_tbl_size; - if( rc = _tlocal_extend_ctxtbl(tls, 4) ){ + rc = _tlocal_tls_ctxtbl_extend(tls, 4); + if (rc) { //TODO: error out return NULL; } } tls->ctx_tbl[i]->ctx_id = ctx->ctx_id; tls->ctx_tbl[i]->gctx = ctx; - tls->ctx_tbl[i]->winfo = _get_new_worker(tls, ctx); + rc = _tlocal_tls_get_worker(tls, &tls->ctx_tbl[i]->winfo); + if (rc) { + //TODO: error out + return NULL; + } + tls->ctx_tbl[i]->winfo->endpoints = calloc(ctx->comm_size, sizeof(ucp_ep_h)); + tls->ctx_tbl[i]->winfo->comm_size = ctx->comm_size; + /* Make sure that we completed all the data structures before * placing the item to the list @@ -804,63 +946,127 @@ static _worker_info_t *_tlocal_add_ctx(_tlocal_table_t *tls, */ opal_atomic_wmb(); - _ctx_append_worker(ctx, tls->ctx_tbl[i]); + /* add this worker into the context list */ + rc = _common_ucx_ctx_append(ctx, tls->ctx_tbl[i]); + if (rc) { + //TODO: error out + return NULL; + } + /* All good - return the record */ return tls->ctx_tbl[i]; } -static int _tlocal_ctx_connect(_tlocal_ctx_t *ctx, int target) +static int _tlocal_ctx_connect(_tlocal_ctx_t *ctx_rec, int target) { ucp_ep_params_t ep_params; - _worker_info_t *winfo = ctx->winfo; - opal_common_ucx_ctx_t *gctx = ctx->gctx; + _worker_info_t *winfo = ctx_rec->winfo; + opal_common_ucx_ctx_t *gctx = ctx_rec->gctx; + ucs_status_t status; int displ; memset(&ep_params, 0, sizeof(ucp_ep_params_t)); ep_params.field_mask = UCP_EP_PARAM_FIELD_REMOTE_ADDRESS; - opal_mutex_lock(&winfo->ctx->mutex); + opal_mutex_lock(&winfo->mutex); displ = gctx->recv_worker_displs[target]; - ep_params.address = (ucp_address_t *)&(gctx->recv_worker_addrs[disp]); + ep_params.address = (ucp_address_t *)&(gctx->recv_worker_addrs[displ]); status = ucp_ep_create(winfo->worker, &ep_params, &winfo->endpoints[target]); + opal_mutex_lock(&winfo->mutex); + if (status != UCS_OK) { // TODO: error out here // OSC_UCX_VERBOSE(1, "ucp_ep_create failed: %d", status); - ret = OPAL_ERROR; + return OPAL_ERROR; } return OPAL_SUCCESS; } -static inline _worker_info_t * +static int _tlocal_ctx_release(opal_common_ucx_ctx_t *ctx) +{ + _tlocal_table_t * tls = _tlocal_get_tls(ctx->wpool); + _tlocal_ctx_t *ctx_rec = _tlocal_ctx_search(tls, ctx->ctx_id); + int rc = OPAL_SUCCESS; + + if (NULL == ctx_rec) { + /* we haven't participated in this context */ + return OPAL_SUCCESS; + } + + /* May free the ctx structure. Do not use it */ + _common_ucx_ctx_remove(ctx, ctx_rec); + rc = _wpool_add_to_idle(tls->wpool, ctx_rec->winfo); + + ctx_rec->ctx_id = 0; + ctx_rec->is_freed = 0; + ctx_rec->gctx = NULL; + ctx_rec->winfo = NULL; + + return rc; +} + +static inline _tlocal_mem_t * _tlocal_search_mem(_tlocal_table_t *tls, int mem_id) { - int i; + size_t i; for(i=0; imem_tbl_size; i++) { - if( tls->mem_tbl[i] == mem_id){ - return tls->mem_tbl[i]->mem; + if( tls->mem_tbl[i]->mem_id == mem_id){ + return tls->mem_tbl[i]; } } return NULL; } + +static void +_tlocal_mem_record_cleanup(_tlocal_mem_t *mem_rec) +{ + size_t i; + if (!mem_rec->is_freed) { + return; + } + /* Remove myself from the memory context structure + * This may result in context release as we are using + * delayed cleanup */ + _common_ucx_mem_remove(mem_rec->gmem, mem_rec); + + for(i = 0; i < mem_rec->gmem->ctx->comm_size; i++) { + if (mem_rec->mem->rkeys[i]) { + ucp_rkey_destroy(mem_rec->mem->rkeys[i]); + } + } + + free(mem_rec->mem->rkeys); + free(mem_rec->mem); + + memset(mem_rec, 0, sizeof(*mem_rec)); +} + + // TODO: Don't want to inline this (slow path) -static _worker_info_t *_tlocal_add_mem(_tlocal_table_t *tls, - opal_common_ucx_ctx_t *mem) +static _tlocal_mem_t *_tlocal_add_mem(_tlocal_table_t *tls, + opal_common_ucx_mem_t *mem) { - int i, rc; + size_t i; + _tlocal_ctx_t *ctx_rec = NULL; + int rc; /* Try to find tavailable spot in the table */ for (i=0; imem_tbl_size; i++) { + if (0 == tls->mem_tbl[i]->mem_id) { + /* Found a clear record */ + } if (tls->mem_tbl[i]->is_freed) { - /* Cleanup the record */ - _tlocal_cleanup_mem_record(tls->mem_tbl[i]); + /* Found a dirty record. Need to clean it first */ + _tlocal_mem_record_cleanup(tls->mem_tbl[i]); break; } } if( tls->mem_tbl_size >= i ){ i = tls->mem_tbl_size; - if( rc = _tlocal_extend_memtbl(tls, 4) ){ + rc = _tlocal_tls_memtbl_extend(tls, 4); + if (rc) { //TODO: error out return NULL; } @@ -868,7 +1074,16 @@ static _worker_info_t *_tlocal_add_mem(_tlocal_table_t *tls, tls->mem_tbl[i]->mem_id = mem->mem_id; tls->mem_tbl[i]->gmem = mem; tls->mem_tbl[i]->is_freed = 0; - tls->mem_tbl[i]->mem = _get_new_memory(tls, mem); + tls->mem_tbl[i]->mem = calloc(1, sizeof(*tls->mem_tbl[i]->mem)); + ctx_rec = _tlocal_ctx_search(tls, mem->ctx->ctx_id); + if (NULL == ctx_rec) { + // TODO: act accordingly - cleanup + return NULL; + } + tls->mem_tbl[i]->mem->worker = ctx_rec->winfo; + tls->mem_tbl[i]->mem->rkeys = calloc(mem->ctx->comm_size, + sizeof(*tls->mem_tbl[i]->mem->rkeys)); + /* Make sure that we completed all the data structures before * placing the item to the list @@ -877,7 +1092,11 @@ static _worker_info_t *_tlocal_add_mem(_tlocal_table_t *tls, */ opal_atomic_wmb(); - _mem_append_rkey(mem, tls->mem_tbl[i]); + rc = _common_ucx_mem_append(mem, tls->mem_tbl[i]); + if (rc) { + // TODO: error handling + return NULL; + } return tls->mem_tbl[i]; } @@ -887,17 +1106,20 @@ static int _tlocal_mem_create_rkey(_tlocal_mem_t *mem_rec, ucp_ep_h ep, int targ _mem_info_t *minfo = mem_rec->mem; opal_common_ucx_mem_t *gmem = mem_rec->gmem; int displ = gmem->mem_displs[target]; + ucs_status_t status; status = ucp_ep_rkey_unpack(ep, &gmem->mem_addrs[displ], &minfo->rkeys[target]); if (status != UCS_OK) { // TODO: error out here // OSC_UCX_VERBOSE(1, "ucp_ep_create failed: %d", status); - ret = OPAL_ERROR; + return OPAL_ERROR; } return OPAL_SUCCESS; } + + OPAL_DECLSPEC int opal_common_ucx_mem_op(opal_common_ucx_mem_t *mem, opal_common_ucx_op_t op, int target, @@ -905,46 +1127,57 @@ OPAL_DECLSPEC int opal_common_ucx_mem_op(opal_common_ucx_mem_t *mem, uint64_t rem_addr) { _tlocal_table_t *tls = NULL; - _worker_info_t *worker_info; + _tlocal_ctx_t *ctx_rec; + _worker_info_t *winfo; + _tlocal_mem_t *mem_rec; _mem_info_t *mem_info; ucp_ep_h ep; ucp_rkey_h rkey; + ucs_status_t status; + int rc; - tls = pthread_get_specific(_tlocal_key); - if( OPAL_UNLIKELY(NULL == tls) ) { - tls = _common_ucx_init_tls(mem->ctx->wpool); - } + tls = _tlocal_get_tls(mem->ctx->wpool); /* Obtain the worker structure */ - worker_info = _tlocal_search_ctx(tls, mem->ctx->ctx_id); - if (OPAL_UNLIKELY(NULL == worker_info)) { - worker_info = _tlocal_add_ctx(tls, mem->ctx); + ctx_rec = _tlocal_ctx_search(tls, mem->ctx->ctx_id); + if (OPAL_UNLIKELY(NULL == ctx_rec)) { + ctx_rec = _tlocal_add_ctx(tls, mem->ctx); + if (NULL == ctx_rec) { + // TODO: err handling + } } + winfo = ctx_rec->winfo; /* Obtain the endpoint */ - if (OPAL_UNLIKELY(NULL == worker_info->endpoints[target])) { - if (rc = _tlocal_ctx_connect(worker_info, target)) { + if (OPAL_UNLIKELY(NULL == winfo->endpoints[target])) { + rc = _tlocal_ctx_connect(ctx_rec, target); + if (rc) { return rc; } } - ep = worker_info->endpoints[target]; + ep = winfo->endpoints[target]; /* Obtain the memory region info */ - mem_info = _tlocal_search_mem(tls, mem->mem_id); - if (OPAL_UNLIKELY(mem_info == NULL)) { - mem_info = _tlocal_add_mem(tls, mem->mem_id); + mem_rec = _tlocal_search_mem(tls, mem->mem_id); + if (OPAL_UNLIKELY(mem_rec == NULL)) { + mem_rec = _tlocal_add_mem(tls, mem); + if (NULL == mem_rec) { + // TODO: err handling + } } + mem_info = mem_rec->mem; /* Obtain the rkey */ if (OPAL_UNLIKELY(NULL == mem_info->rkeys[target])) { // Create the rkey - if (rc = _tlocal_mem_rkey_create(mem_info, target)) { + rc = _tlocal_mem_create_rkey(mem_rec, ep, target); + if (rc) { return rc; } } rkey = mem_info->rkeys[target]; /* Perform the operation */ - opal_mutex_lock(worker_info->mutex); + opal_mutex_lock(&winfo->mutex); switch(op){ case OPAL_COMMON_UCX_GET: status = ucp_put_nbi(ep, buffer,len, rem_addr, rkey); @@ -963,26 +1196,33 @@ OPAL_DECLSPEC int opal_common_ucx_mem_op(opal_common_ucx_mem_t *mem, } break; } - opal_mutex_unlock(worker_info->mutex); + opal_mutex_unlock(&winfo->mutex); + return OPAL_SUCCESS; } -int opal_common_ucx_mem_flush(opal_common_ucx_mem_t *mem, - opal_common_ucx_flush_scope_t scope, - int target) +OPAL_DECLSPEC void +opal_common_ucx_mem_flush(opal_common_ucx_mem_t *mem, + opal_common_ucx_flush_scope_t scope, + int target) { _worker_list_item_t *item; - opal_mutex_lock(&mem->ctx->mutex); + opal_common_ucx_ctx_t *ctx = mem->ctx; + opal_mutex_lock(&ctx->mutex); OPAL_LIST_FOREACH(item, &ctx->workers, _worker_list_item_t) { switch (scope) { case OPAL_COMMON_UCX_SCOPE_WORKER: + opal_mutex_lock(&item->ptr->winfo->mutex); opal_common_ucx_worker_flush(item->ptr->winfo->worker); + opal_mutex_unlock(&item->ptr->winfo->mutex); break; case OPAL_COMMON_UCX_SCOPE_EP: if (NULL != item->ptr->winfo->endpoints[target] ) { + opal_mutex_lock(&item->ptr->winfo->mutex); opal_common_ucx_ep_flush(item->ptr->winfo->endpoints[target], item->ptr->winfo->worker); + opal_mutex_unlock(&item->ptr->winfo->mutex); } } } - opal_mutex_unlock(&mem->ctx->mutex); + opal_mutex_unlock(&ctx->mutex); } diff --git a/opal/mca/common/ucx/common_ucx.h b/opal/mca/common/ucx/common_ucx.h index e7b9bf0299f..7bca0e3aefb 100644 --- a/opal/mca/common/ucx/common_ucx.h +++ b/opal/mca/common/ucx/common_ucx.h @@ -115,7 +115,7 @@ typedef struct { opal_list_t workers; /* active worker lists */ char *recv_worker_addrs; int *recv_worker_displs; - int comm_size; + size_t comm_size; } opal_common_ucx_ctx_t; typedef struct { @@ -123,10 +123,9 @@ typedef struct { opal_mutex_t mutex; opal_common_ucx_ctx_t *ctx; /* which ctx this mem_reg belongs to */ ucp_mem_h memh; - opal_list_t mem_regions; /* mem region lists */ + opal_list_t registrations; /* mem region lists */ char *mem_addrs; int *mem_displs; - int comm_size; } opal_common_ucx_mem_t; typedef enum { @@ -159,6 +158,7 @@ OPAL_DECLSPEC int opal_common_ucx_ctx_create(opal_common_ucx_wpool_t *wpool, int opal_common_ucx_exchange_func_t exchange_func, void *exchange_metadata, opal_common_ucx_ctx_t **ctx_ptr); +OPAL_DECLSPEC void opal_common_ucx_ctx_release(opal_common_ucx_ctx_t *ctx); OPAL_DECLSPEC int opal_common_ucx_mem_create(opal_common_ucx_ctx_t *ctx, int comm_size, void **mem_base, size_t mem_size, opal_common_ucx_mem_type_t mem_type, @@ -167,6 +167,7 @@ OPAL_DECLSPEC int opal_common_ucx_mem_create(opal_common_ucx_ctx_t *ctx, int com opal_common_ucx_mem_t **mem_ptr); + OPAL_DECLSPEC void opal_common_ucx_mca_register(void); OPAL_DECLSPEC void opal_common_ucx_mca_deregister(void); OPAL_DECLSPEC void opal_common_ucx_empty_complete_cb(void *request, ucs_status_t status); From a60c39fb3c3298e6504a68c80cd07af51a03d45e Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Mon, 12 Nov 2018 15:26:00 -0800 Subject: [PATCH 14/59] common code change --- opal/mca/common/ucx/common_ucx.c | 4 ++++ opal/mca/common/ucx/common_ucx.h | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index d6bdae02f90..98e987d961e 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -1226,3 +1226,7 @@ opal_common_ucx_mem_flush(opal_common_ucx_mem_t *mem, } opal_mutex_unlock(&ctx->mutex); } + +OPAL_DECLSPEC int opal_common_ucx_workers_progress(opal_common_ucx_wpool_t *wpool) { + // TODO +} diff --git a/opal/mca/common/ucx/common_ucx.h b/opal/mca/common/ucx/common_ucx.h index 7bca0e3aefb..230cd7aa8ad 100644 --- a/opal/mca/common/ucx/common_ucx.h +++ b/opal/mca/common/ucx/common_ucx.h @@ -165,7 +165,7 @@ OPAL_DECLSPEC int opal_common_ucx_mem_create(opal_common_ucx_ctx_t *ctx, int com opal_common_ucx_exchange_func_t exchange_func, void *exchange_metadata, opal_common_ucx_mem_t **mem_ptr); - +OPAL_DECLSPEC int opal_common_ucx_workers_progress(opal_common_ucx_wpool_t *wpool); OPAL_DECLSPEC void opal_common_ucx_mca_register(void); From 1e828090650d23f2c7d15437d91be99e2a5524dd Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Mon, 12 Nov 2018 14:31:14 -0800 Subject: [PATCH 15/59] osc changes --- ompi/mca/osc/ucx/osc_ucx.h | 18 +- ompi/mca/osc/ucx/osc_ucx_component.c | 381 ++++----------------------- 2 files changed, 64 insertions(+), 335 deletions(-) diff --git a/ompi/mca/osc/ucx/osc_ucx.h b/ompi/mca/osc/ucx/osc_ucx.h index 44dff95a845..7b2b97e910b 100644 --- a/ompi/mca/osc/ucx/osc_ucx.h +++ b/ompi/mca/osc/ucx/osc_ucx.h @@ -24,16 +24,9 @@ #define OMPI_OSC_UCX_ATTACH_MAX 32 #define OMPI_OSC_UCX_RKEY_BUF_MAX 1024 -typedef struct ompi_osc_ucx_win_info { - ucp_rkey_h rkey; - uint64_t addr; - bool rkey_init; -} ompi_osc_ucx_win_info_t; - typedef struct ompi_osc_ucx_component { ompi_osc_base_component_t super; - ucp_context_h ucp_context; - ucp_worker_h ucp_worker; + opal_common_ucx_wpool_t *wpool; bool enable_mpi_threads; opal_free_list_t requests; /* request free list for the r* communication variants */ bool env_initialized; /* UCX environment is initialized or not */ @@ -97,12 +90,10 @@ typedef struct ompi_osc_ucx_state { typedef struct ompi_osc_ucx_module { ompi_osc_base_module_t super; struct ompi_communicator_t *comm; - ucp_mem_h memh; /* remote accessible memory */ int flavor; size_t size; - ucp_mem_h state_memh; - ompi_osc_ucx_win_info_t *win_info_array; - ompi_osc_ucx_win_info_t *state_info_array; + uint64_t *addrs; + uint64_t *state_addrs; int disp_unit; /* if disp_unit >= 0, then everyone has the same * disp unit size; if disp_unit == -1, then we * need to look at disp_units */ @@ -122,6 +113,9 @@ typedef struct ompi_osc_ucx_module { uint64_t req_result; int *start_grp_ranks; bool lock_all_is_nocheck; + opal_common_ucx_ctx_t *ctx; + opal_common_ucx_mem_t *mem; + opal_common_ucx_mem_t *state_mem; } ompi_osc_ucx_module_t; typedef enum locktype { diff --git a/ompi/mca/osc/ucx/osc_ucx_component.c b/ompi/mca/osc/ucx/osc_ucx_component.c index 6fd3291bad0..02f975dbf24 100644 --- a/ompi/mca/osc/ucx/osc_ucx_component.c +++ b/ompi/mca/osc/ucx/osc_ucx_component.c @@ -54,8 +54,7 @@ ompi_osc_ucx_component_t mca_osc_ucx_component = { .osc_select = component_select, .osc_finalize = component_finalize, }, - .ucp_context = NULL, - .ucp_worker = NULL, + .wpool = NULL, .env_initialized = false, .num_incomplete_req_ops = 0, .num_modules = 0 @@ -120,37 +119,22 @@ static int component_register(void) { } static int progress_callback(void) { - ucp_worker_progress(mca_osc_ucx_component.ucp_worker); + if (mca_osc_ucx_component.wpool != NULL) { + opal_common_ucx_workers_progress(mca_osc_ucx_component.wpool); + } return 0; } static int component_init(bool enable_progress_threads, bool enable_mpi_threads) { mca_osc_ucx_component.enable_mpi_threads = enable_mpi_threads; - + mca_osc_ucx_component.wpool = opal_common_ucx_wpool_allocate(); opal_common_ucx_mca_register(); return OMPI_SUCCESS; } static int component_finalize(void) { - int i; - for (i = 0; i < ompi_proc_world_size(); i++) { - ucp_ep_h ep = OSC_UCX_GET_EP(&(ompi_mpi_comm_world.comm), i); - if (ep != NULL) { - ucp_ep_destroy(ep); - } - } - - if (mca_osc_ucx_component.ucp_worker != NULL) { - ucp_worker_destroy(mca_osc_ucx_component.ucp_worker); - } - - assert(mca_osc_ucx_component.num_incomplete_req_ops == 0); - if (mca_osc_ucx_component.env_initialized == true) { - OBJ_DESTRUCT(&mca_osc_ucx_component.requests); - ucp_cleanup(mca_osc_ucx_component.ucp_context); - mca_osc_ucx_component.env_initialized = false; - } opal_common_ucx_mca_deregister(); + opal_common_ucx_wpool_free(mca_osc_ucx_component.wpool); return OMPI_SUCCESS; } @@ -160,9 +144,11 @@ static int component_query(struct ompi_win_t *win, void **base, size_t size, int return mca_osc_ucx_component.priority; } -static inline int allgather_len_and_info(void *my_info, int my_info_len, char **recv_info, - int *disps, struct ompi_communicator_t *comm) { +static int exchange_len_info(void *my_info, size_t my_info_len, char **recv_info_ptr, + int **disps_ptr, void *metadata) +{ int ret = OMPI_SUCCESS; + struct ompi_communicator_t *comm = (struct ompi_communicator_t *)metadata; int comm_size = ompi_comm_size(comm); int lens[comm_size]; int total_len, i; @@ -175,15 +161,15 @@ static inline int allgather_len_and_info(void *my_info, int my_info_len, char ** } total_len = 0; + (*disps_ptr) = (int *)calloc(comm_size, sizeof(int)); for (i = 0; i < comm_size; i++) { - disps[i] = total_len; + (*disps_ptr)[i] = total_len; total_len += lens[i]; } - (*recv_info) = (char *)malloc(total_len); - + (*recv_info_ptr) = (char *)calloc(total_len, sizeof(char)); ret = comm->c_coll->coll_allgatherv(my_info, my_info_len, MPI_BYTE, - (void *)(*recv_info), lens, disps, MPI_BYTE, + (void *)(*recv_info_ptr), lens, (*disps_ptr), MPI_BYTE, comm, comm->c_coll->coll_allgatherv_module); if (OMPI_SUCCESS != ret) { return ret; @@ -192,60 +178,6 @@ static inline int allgather_len_and_info(void *my_info, int my_info_len, char ** return ret; } -static inline int mem_map(void **base, size_t size, ucp_mem_h *memh_ptr, - ompi_osc_ucx_module_t *module, int flavor) { - ucp_mem_map_params_t mem_params; - ucp_mem_attr_t mem_attrs; - ucs_status_t status; - int ret = OMPI_SUCCESS; - - if (!(flavor == MPI_WIN_FLAVOR_ALLOCATE || flavor == MPI_WIN_FLAVOR_CREATE) - || size == 0) { - return ret; - } - - memset(&mem_params, 0, sizeof(ucp_mem_map_params_t)); - mem_params.field_mask = UCP_MEM_MAP_PARAM_FIELD_ADDRESS | - UCP_MEM_MAP_PARAM_FIELD_LENGTH | - UCP_MEM_MAP_PARAM_FIELD_FLAGS; - mem_params.length = size; - if (flavor == MPI_WIN_FLAVOR_ALLOCATE) { - mem_params.address = NULL; - mem_params.flags = UCP_MEM_MAP_ALLOCATE; - } else { - mem_params.address = (*base); - } - - /* memory map */ - - status = ucp_mem_map(mca_osc_ucx_component.ucp_context, &mem_params, memh_ptr); - if (status != UCS_OK) { - OSC_UCX_VERBOSE(1, "ucp_mem_map failed: %d", status); - ret = OMPI_ERROR; - goto error; - } - - mem_attrs.field_mask = UCP_MEM_ATTR_FIELD_ADDRESS | UCP_MEM_ATTR_FIELD_LENGTH; - status = ucp_mem_query((*memh_ptr), &mem_attrs); - if (status != UCS_OK) { - OSC_UCX_VERBOSE(1, "ucp_mem_query failed: %d", status); - ret = OMPI_ERROR; - goto error; - } - - assert(mem_attrs.length >= size); - if (flavor == MPI_WIN_FLAVOR_CREATE) { - assert(mem_attrs.address == (*base)); - } else { - (*base) = mem_attrs.address; - } - - return ret; - error: - ucp_mem_unmap(mca_osc_ucx_component.ucp_context, (*memh_ptr)); - return ret; -} - static void ompi_osc_ucx_unregister_progress() { int ret; @@ -269,21 +201,12 @@ static int component_select(struct ompi_win_t *win, void **base, size_t size, in int ret = OMPI_SUCCESS; ucs_status_t status; int i, comm_size = ompi_comm_size(comm); - int is_eps_ready; - bool eps_created = false, env_initialized = false; - ucp_address_t *my_addr = NULL; - size_t my_addr_len; - char *recv_buf = NULL; - void *rkey_buffer = NULL, *state_rkey_buffer = NULL; - size_t rkey_buffer_size, state_rkey_buffer_size; + bool env_initialized = false; void *state_base = NULL; - void * my_info = NULL; - size_t my_info_len; - int disps[comm_size]; - int rkey_sizes[comm_size]; + opal_common_ucx_mem_type_t mem_type; uint64_t zero = 0; - size_t info_offset; - uint64_t size_u64; + void * my_info = NULL; + char *recv_buf = NULL; /* the osc/sm component is the exclusive provider for support for * shared memory windows */ @@ -292,16 +215,6 @@ static int component_select(struct ompi_win_t *win, void **base, size_t size, in } if (mca_osc_ucx_component.env_initialized == false) { - ucp_config_t *config = NULL; - ucp_params_t context_params; - ucp_worker_params_t worker_params; - ucp_worker_attr_t worker_attr; - - status = ucp_config_read("MPI", NULL, &config); - if (UCS_OK != status) { - OSC_UCX_VERBOSE(1, "ucp_config_read failed: %d", status); - return OMPI_ERROR; - } OBJ_CONSTRUCT(&mca_osc_ucx_component.requests, opal_free_list_t); ret = opal_free_list_init (&mca_osc_ucx_component.requests, @@ -314,57 +227,16 @@ static int component_select(struct ompi_win_t *win, void **base, size_t size, in goto error; } - /* initialize UCP context */ - - memset(&context_params, 0, sizeof(context_params)); - context_params.field_mask = UCP_PARAM_FIELD_FEATURES | - UCP_PARAM_FIELD_MT_WORKERS_SHARED | - UCP_PARAM_FIELD_ESTIMATED_NUM_EPS | - UCP_PARAM_FIELD_REQUEST_INIT | - UCP_PARAM_FIELD_REQUEST_SIZE; - context_params.features = UCP_FEATURE_RMA | UCP_FEATURE_AMO32 | UCP_FEATURE_AMO64; - context_params.mt_workers_shared = 0; - context_params.estimated_num_eps = ompi_proc_world_size(); - context_params.request_init = internal_req_init; - context_params.request_size = sizeof(ompi_osc_ucx_internal_request_t); - - status = ucp_init(&context_params, config, &mca_osc_ucx_component.ucp_context); - ucp_config_release(config); - if (UCS_OK != status) { - OSC_UCX_VERBOSE(1, "ucp_init failed: %d", status); - ret = OMPI_ERROR; + ret = opal_common_ucx_wpool_init(mca_osc_ucx_component.wpool, + ompi_proc_world_size(), + internal_req_init, + sizeof(ompi_osc_ucx_internal_request_t), + mca_osc_ucx_component.enable_mpi_threads); + if (OMPI_SUCCESS != ret) { + OSC_UCX_VERBOSE(1, "opal_common_ucx_wpool_init failed: %d", ret); goto error; } - assert(mca_osc_ucx_component.ucp_worker == NULL); - memset(&worker_params, 0, sizeof(worker_params)); - worker_params.field_mask = UCP_WORKER_PARAM_FIELD_THREAD_MODE; - worker_params.thread_mode = (mca_osc_ucx_component.enable_mpi_threads == true) - ? UCS_THREAD_MODE_MULTI : UCS_THREAD_MODE_SINGLE; - status = ucp_worker_create(mca_osc_ucx_component.ucp_context, &worker_params, - &(mca_osc_ucx_component.ucp_worker)); - if (UCS_OK != status) { - OSC_UCX_VERBOSE(1, "ucp_worker_create failed: %d", status); - ret = OMPI_ERROR; - goto error_nomem; - } - - /* query UCP worker attributes */ - worker_attr.field_mask = UCP_WORKER_ATTR_FIELD_THREAD_MODE; - status = ucp_worker_query(mca_osc_ucx_component.ucp_worker, &worker_attr); - if (UCS_OK != status) { - OSC_UCX_VERBOSE(1, "ucp_worker_query failed: %d", status); - ret = OMPI_ERROR; - goto error_nomem; - } - - if (mca_osc_ucx_component.enable_mpi_threads == true && - worker_attr.thread_mode != UCS_THREAD_MODE_MULTI) { - OSC_UCX_VERBOSE(1, "ucx does not support multithreading"); - ret = OMPI_ERROR; - goto error_nomem; - } - mca_osc_ucx_component.env_initialized = true; env_initialized = true; } @@ -425,187 +297,72 @@ static int component_select(struct ompi_win_t *win, void **base, size_t size, in } } - /* exchange endpoints if necessary */ - is_eps_ready = 1; - for (i = 0; i < comm_size; i++) { - if (OSC_UCX_GET_EP(module->comm, i) == NULL) { - is_eps_ready = 0; - break; - } - } - - ret = module->comm->c_coll->coll_allreduce(MPI_IN_PLACE, &is_eps_ready, 1, MPI_INT, - MPI_LAND, - module->comm, - module->comm->c_coll->coll_allreduce_module); + ret = opal_common_ucx_ctx_create(mca_osc_ucx_component.wpool, comm_size, + &exchange_len_info, (void *)module->comm, + &module->ctx); if (OMPI_SUCCESS != ret) { goto error; } - if (!is_eps_ready) { - status = ucp_worker_get_address(mca_osc_ucx_component.ucp_worker, - &my_addr, &my_addr_len); - if (status != UCS_OK) { - OSC_UCX_VERBOSE(1, "ucp_worker_get_address failed: %d", status); - ret = OMPI_ERROR; - goto error; + if (flavor == MPI_WIN_FLAVOR_ALLOCATE || flavor == MPI_WIN_FLAVOR_CREATE) { + swtich (flavor) { + case MPI_WIN_FLAVOR_ALLOCATE: + mem_type = OPAL_COMMON_UCX_MEM_ALLOCATE_MAP; + break; + case MPI_WIN_FLAVOR_CREATE: + mem_type = OPAL_COMMON_UCX_MEM_MAP; + break; } - ret = allgather_len_and_info(my_addr, (int)my_addr_len, - &recv_buf, disps, module->comm); + ret = opal_common_ucx_mem_create(module->ctx, comm_size, base, size, + mem_type, &exchange_len_info, + (void *)module->comm, &module->mem); if (ret != OMPI_SUCCESS) { goto error; } - for (i = 0; i < comm_size; i++) { - if (OSC_UCX_GET_EP(module->comm, i) == NULL) { - ucp_ep_params_t ep_params; - ucp_ep_h ep; - memset(&ep_params, 0, sizeof(ucp_ep_params_t)); - ep_params.field_mask = UCP_EP_PARAM_FIELD_REMOTE_ADDRESS; - ep_params.address = (ucp_address_t *)&(recv_buf[disps[i]]); - status = ucp_ep_create(mca_osc_ucx_component.ucp_worker, &ep_params, &ep); - if (status != UCS_OK) { - OSC_UCX_VERBOSE(1, "ucp_ep_create failed: %d", status); - ret = OMPI_ERROR; - goto error; - } - - ompi_comm_peer_lookup(module->comm, i)->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_UCX] = ep; - } - } - - ucp_worker_release_address(mca_osc_ucx_component.ucp_worker, my_addr); - my_addr = NULL; - free(recv_buf); - recv_buf = NULL; - - eps_created = true; - } - - ret = mem_map(base, size, &(module->memh), module, flavor); - if (ret != OMPI_SUCCESS) { - goto error; } state_base = (void *)&(module->state); - ret = mem_map(&state_base, sizeof(ompi_osc_ucx_state_t), &(module->state_memh), - module, MPI_WIN_FLAVOR_CREATE); + ret = opal_common_ucx_mem_create(module->ctx, comm_size, state_base, + sizeof(ompi_osc_ucx_state_t), + OPAL_COMMON_UCX_MEM_MAP, &exchange_len_info, + (void *)module->comm, &module->state_mem); if (ret != OMPI_SUCCESS) { goto error; } - module->win_info_array = calloc(comm_size, sizeof(ompi_osc_ucx_win_info_t)); - if (module->win_info_array == NULL) { - ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; - goto error; - } - - module->state_info_array = calloc(comm_size, sizeof(ompi_osc_ucx_win_info_t)); - if (module->state_info_array == NULL) { - ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; - goto error; - } - - if (size > 0 && (flavor == MPI_WIN_FLAVOR_ALLOCATE || flavor == MPI_WIN_FLAVOR_CREATE)) { - status = ucp_rkey_pack(mca_osc_ucx_component.ucp_context, module->memh, - &rkey_buffer, &rkey_buffer_size); - if (status != UCS_OK) { - OSC_UCX_VERBOSE(1, "ucp_rkey_pack failed: %d", status); - ret = OMPI_ERROR; - goto error; - } - } else { - rkey_buffer_size = 0; - } - - status = ucp_rkey_pack(mca_osc_ucx_component.ucp_context, module->state_memh, - &state_rkey_buffer, &state_rkey_buffer_size); - if (status != UCS_OK) { - OSC_UCX_VERBOSE(1, "ucp_rkey_pack failed: %d", status); - ret = OMPI_ERROR; - goto error; - } - - size_u64 = (uint64_t)size; - my_info_len = 3 * sizeof(uint64_t) + rkey_buffer_size + state_rkey_buffer_size; - my_info = malloc(my_info_len); + /* exchange window addrs */ + my_info = malloc(2 * sizeof(uint64_t)); if (my_info == NULL) { ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; goto error; } - info_offset = 0; - if (flavor == MPI_WIN_FLAVOR_ALLOCATE || flavor == MPI_WIN_FLAVOR_CREATE) { - memcpy_off(my_info, base, sizeof(uint64_t), info_offset); + memcpy(my_info, base, sizeof(uint64_t)); } else { - memcpy_off(my_info, &zero, sizeof(uint64_t), info_offset); + memcpy(my_info, &zero, sizeof(uint64_t)); } - memcpy_off(my_info, &state_base, sizeof(uint64_t), info_offset); - memcpy_off(my_info, &size_u64, sizeof(uint64_t), info_offset); - memcpy_off(my_info, rkey_buffer, rkey_buffer_size, info_offset); - memcpy_off(my_info, state_rkey_buffer, state_rkey_buffer_size, info_offset); - - assert(my_info_len == info_offset); + memcpy(my_info + sizeof(uint64_t), &state_base, sizeof(uint64_t)); - ret = allgather_len_and_info(my_info, (int)my_info_len, &recv_buf, disps, module->comm); + recv_buf = (char *)calloc(comm_size, 2 * sizeof(uint64_t)); + ret = comm->c_coll->coll_allgather((void *)&my_info, 2 * sizeof(uint64_t), + MPI_BYTE, recv_buf, 2 * sizeof(uint64_t), + MPI_BYTE, comm, comm->c_coll->coll_allgather_module); if (ret != OMPI_SUCCESS) { goto error; } - ret = comm->c_coll->coll_allgather((void *)&rkey_buffer_size, 1, MPI_INT, - rkey_sizes, 1, MPI_INT, comm, - comm->c_coll->coll_allgather_module); - if (OMPI_SUCCESS != ret) { - goto error; - } - + module->addrs = calloc(comm_size, sizeof(uint64_t)); + module->state_addrs = calloc(comm_size, sizeof(uint64_t)); for (i = 0; i < comm_size; i++) { - ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, i); - uint64_t dest_size; - assert(ep != NULL); - - info_offset = disps[i]; - - memcpy(&(module->win_info_array[i]).addr, &recv_buf[info_offset], sizeof(uint64_t)); - info_offset += sizeof(uint64_t); - memcpy(&(module->state_info_array[i]).addr, &recv_buf[info_offset], sizeof(uint64_t)); - info_offset += sizeof(uint64_t); - memcpy(&dest_size, &recv_buf[info_offset], sizeof(uint64_t)); - info_offset += sizeof(uint64_t); - - (module->win_info_array[i]).rkey_init = false; - if (dest_size > 0 && (flavor == MPI_WIN_FLAVOR_ALLOCATE || flavor == MPI_WIN_FLAVOR_CREATE)) { - status = ucp_ep_rkey_unpack(ep, &recv_buf[info_offset], - &((module->win_info_array[i]).rkey)); - if (status != UCS_OK) { - OSC_UCX_VERBOSE(1, "ucp_ep_rkey_unpack failed: %d", status); - ret = OMPI_ERROR; - goto error; - } - info_offset += rkey_sizes[i]; - (module->win_info_array[i]).rkey_init = true; - } - - status = ucp_ep_rkey_unpack(ep, &recv_buf[info_offset], - &((module->state_info_array[i]).rkey)); - if (status != UCS_OK) { - OSC_UCX_VERBOSE(1, "ucp_ep_rkey_unpack failed: %d", status); - ret = OMPI_ERROR; - goto error; - } - (module->state_info_array[i]).rkey_init = true; + memcpy(&(module->addrs[i]), recv_buf, sizeof(uint64_t)); + memcpy(&(module->state_addrs[i]), recv_buf + sizeof(uint64_t), sizeof(uint64_t)); } - - free(my_info); free(recv_buf); - if (rkey_buffer_size != 0) { - ucp_rkey_buffer_release(rkey_buffer); - } - ucp_rkey_buffer_release(state_rkey_buffer); - + /* init window state */ module->state.lock = TARGET_LOCK_UNLOCKED; module->state.post_index = 0; memset((void *)module->state.post_state, 0, sizeof(uint64_t) * OMPI_OSC_UCX_POST_PEER_MAX); @@ -655,30 +412,9 @@ static int component_select(struct ompi_win_t *win, void **base, size_t size, in return ret; error: - if (my_addr) ucp_worker_release_address(mca_osc_ucx_component.ucp_worker, my_addr); - if (recv_buf) free(recv_buf); - if (my_info) free(my_info); - for (i = 0; i < comm_size; i++) { - if ((module->win_info_array[i]).rkey != NULL) { - ucp_rkey_destroy((module->win_info_array[i]).rkey); - } - if ((module->state_info_array[i]).rkey != NULL) { - ucp_rkey_destroy((module->state_info_array[i]).rkey); - } - } - if (rkey_buffer) ucp_rkey_buffer_release(rkey_buffer); - if (state_rkey_buffer) ucp_rkey_buffer_release(state_rkey_buffer); - if (module->win_info_array) free(module->win_info_array); - if (module->state_info_array) free(module->state_info_array); if (module->disp_units) free(module->disp_units); if (module->comm) ompi_comm_free(&module->comm); if (module->per_target_ops_nums) free(module->per_target_ops_nums); - if (eps_created) { - for (i = 0; i < comm_size; i++) { - ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, i); - ucp_ep_destroy(ep); - } - } if (module) { free(module); ompi_osc_ucx_unregister_progress(); @@ -686,9 +422,8 @@ static int component_select(struct ompi_win_t *win, void **base, size_t size, in error_nomem: if (env_initialized == true) { + opal_common_ucx_wpool_finalize(mca_osc_ucx_component.wpool); OBJ_DESTRUCT(&mca_osc_ucx_component.requests); - ucp_worker_destroy(mca_osc_ucx_component.ucp_worker); - ucp_cleanup(mca_osc_ucx_component.ucp_context); mca_osc_ucx_component.env_initialized = false; } return ret; From 64469486afb4a80fbba7150a41c91d0ded0ee31e Mon Sep 17 00:00:00 2001 From: "Artem Y. Polyakov" Date: Mon, 12 Nov 2018 18:02:20 -0800 Subject: [PATCH 16/59] OSC builds as well. --- ompi/mca/osc/ucx/osc_ucx_active_target.c | 89 +++++----- ompi/mca/osc/ucx/osc_ucx_comm.c | 159 +++++++++++------- ompi/mca/osc/ucx/osc_ucx_component.c | 25 ++- ompi/mca/osc/ucx/osc_ucx_passive_target.c | 188 ++++++++++++---------- opal/mca/common/ucx/common_ucx.c | 150 +++++++++++++++-- opal/mca/common/ucx/common_ucx.h | 25 +++ 6 files changed, 436 insertions(+), 200 deletions(-) diff --git a/ompi/mca/osc/ucx/osc_ucx_active_target.c b/ompi/mca/osc/ucx/osc_ucx_active_target.c index 3c0a1488eec..c0271dfbfbe 100644 --- a/ompi/mca/osc/ucx/osc_ucx_active_target.c +++ b/ompi/mca/osc/ucx/osc_ucx_active_target.c @@ -59,6 +59,7 @@ static inline void ompi_osc_ucx_handle_incoming_post(ompi_osc_ucx_module_t *modu } int ompi_osc_ucx_fence(int assert, struct ompi_win_t *win) { + /* ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module; int ret; @@ -86,9 +87,13 @@ int ompi_osc_ucx_fence(int assert, struct ompi_win_t *win) { return module->comm->c_coll->coll_barrier(module->comm, module->comm->c_coll->coll_barrier_module); + */ + return OMPI_SUCCESS; } int ompi_osc_ucx_start(struct ompi_group_t *group, int assert, struct ompi_win_t *win) { + + /* ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module; int i, size, *ranks_in_grp = NULL, *ranks_in_win_grp = NULL; ompi_group_t *win_group = NULL; @@ -126,7 +131,7 @@ int ompi_osc_ucx_start(struct ompi_group_t *group, int assert, struct ompi_win_t if ((assert & MPI_MODE_NOCHECK) == 0) { ompi_osc_ucx_pending_post_t *pending_post, *next; - /* first look through the pending list */ + // first look through the pending list OPAL_LIST_FOREACH_SAFE(pending_post, next, &module->pending_posts, ompi_osc_ucx_pending_post_t) { for (i = 0; i < size; i++) { if (pending_post->rank == ranks_in_win_grp[i]) { @@ -138,7 +143,7 @@ int ompi_osc_ucx_start(struct ompi_group_t *group, int assert, struct ompi_win_t } } - /* waiting for the rest post requests to come */ + // waiting for the rest post requests to come while (module->post_count != size) { for (i = 0; i < OMPI_OSC_UCX_POST_PEER_MAX; i++) { if (0 == module->state.post_state[i]) { @@ -159,9 +164,12 @@ int ompi_osc_ucx_start(struct ompi_group_t *group, int assert, struct ompi_win_t module->start_grp_ranks = ranks_in_win_grp; return ret; + */ + return OMPI_SUCCESS; } int ompi_osc_ucx_complete(struct ompi_win_t *win) { + /* ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module; ucs_status_t status; int i, size; @@ -183,7 +191,7 @@ int ompi_osc_ucx_complete(struct ompi_win_t *win) { size = ompi_group_size(module->start_group); for (i = 0; i < size; i++) { - uint64_t remote_addr = (module->state_info_array)[module->start_grp_ranks[i]].addr + OSC_UCX_STATE_COMPLETE_COUNT_OFFSET; /* write to state.complete_count on remote side */ + uint64_t remote_addr = (module->state_info_array)[module->start_grp_ranks[i]].addr + OSC_UCX_STATE_COMPLETE_COUNT_OFFSET; // write to state.complete_count on remote side ucp_rkey_h rkey = (module->state_info_array)[module->start_grp_ranks[i]].rkey; ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, module->start_grp_ranks[i]); @@ -204,9 +212,12 @@ int ompi_osc_ucx_complete(struct ompi_win_t *win) { free(module->start_grp_ranks); return ret; + */ + return OMPI_SUCCESS; } int ompi_osc_ucx_post(struct ompi_group_t *group, int assert, struct ompi_win_t *win) { + /* ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module; int ret = OMPI_SUCCESS; @@ -243,12 +254,12 @@ int ompi_osc_ucx_post(struct ompi_group_t *group, int assert, struct ompi_win_t } for (i = 0; i < size; i++) { - uint64_t remote_addr = (module->state_info_array)[ranks_in_win_grp[i]].addr + OSC_UCX_STATE_POST_INDEX_OFFSET; /* write to state.post_index on remote side */ + uint64_t remote_addr = (module->state_info_array)[ranks_in_win_grp[i]].addr + OSC_UCX_STATE_POST_INDEX_OFFSET; // write to state.post_index on remote side ucp_rkey_h rkey = (module->state_info_array)[ranks_in_win_grp[i]].rkey; ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, ranks_in_win_grp[i]); uint64_t curr_idx = 0, result = 0; - /* do fop first to get an post index */ + // do fop first to get an post index opal_common_ucx_atomic_fetch(ep, UCP_ATOMIC_FETCH_OP_FADD, 1, &result, sizeof(result), remote_addr, rkey, mca_osc_ucx_component.ucp_worker); @@ -257,7 +268,7 @@ int ompi_osc_ucx_post(struct ompi_group_t *group, int assert, struct ompi_win_t remote_addr = (module->state_info_array)[ranks_in_win_grp[i]].addr + OSC_UCX_STATE_POST_STATE_OFFSET + sizeof(uint64_t) * curr_idx; - /* do cas to send post message */ + // do cas to send post message do { opal_common_ucx_atomic_cswap(ep, 0, (uint64_t)myrank + 1, &result, sizeof(result), remote_addr, rkey, @@ -266,9 +277,9 @@ int ompi_osc_ucx_post(struct ompi_group_t *group, int assert, struct ompi_win_t if (result == 0) break; - /* prevent circular wait by checking for post messages received */ + // prevent circular wait by checking for post messages received for (j = 0; j < OMPI_OSC_UCX_POST_PEER_MAX; j++) { - /* no post at this index (yet) */ + // no post at this index (yet) if (0 == module->state.post_state[j]) { continue; } @@ -288,56 +299,58 @@ int ompi_osc_ucx_post(struct ompi_group_t *group, int assert, struct ompi_win_t module->epoch_type.exposure = POST_WAIT_EPOCH; return ret; + */ + return OMPI_SUCCESS; } int ompi_osc_ucx_wait(struct ompi_win_t *win) { - ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module; - int size; +// ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module; +// int size; - if (module->epoch_type.exposure != POST_WAIT_EPOCH) { - return OMPI_ERR_RMA_SYNC; - } +// if (module->epoch_type.exposure != POST_WAIT_EPOCH) { +// return OMPI_ERR_RMA_SYNC; +// } - size = ompi_group_size(module->post_group); +// size = ompi_group_size(module->post_group); - while (module->state.complete_count != (uint64_t)size) { - /* not sure if this is required */ - ucp_worker_progress(mca_osc_ucx_component.ucp_worker); - } +// while (module->state.complete_count != (uint64_t)size) { +// /* not sure if this is required */ +// ucp_worker_progress(mca_osc_ucx_component.ucp_worker); +// } - module->state.complete_count = 0; +// module->state.complete_count = 0; - OBJ_RELEASE(module->post_group); - module->post_group = NULL; +// OBJ_RELEASE(module->post_group); +// module->post_group = NULL; - module->epoch_type.exposure = NONE_EPOCH; +// module->epoch_type.exposure = NONE_EPOCH; return OMPI_SUCCESS; } int ompi_osc_ucx_test(struct ompi_win_t *win, int *flag) { - ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module; - int size; +// ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module; +// int size; - if (module->epoch_type.exposure != POST_WAIT_EPOCH) { - return OMPI_ERR_RMA_SYNC; - } +// if (module->epoch_type.exposure != POST_WAIT_EPOCH) { +// return OMPI_ERR_RMA_SYNC; +// } - size = ompi_group_size(module->post_group); +// size = ompi_group_size(module->post_group); - opal_progress(); +// opal_progress(); - if (module->state.complete_count == (uint64_t)size) { - OBJ_RELEASE(module->post_group); - module->post_group = NULL; +// if (module->state.complete_count == (uint64_t)size) { +// OBJ_RELEASE(module->post_group); +// module->post_group = NULL; - module->state.complete_count = 0; +// module->state.complete_count = 0; - module->epoch_type.exposure = NONE_EPOCH; - *flag = 1; - } else { - *flag = 0; - } +// module->epoch_type.exposure = NONE_EPOCH; +// *flag = 1; +// } else { +// *flag = 0; +// } return OMPI_SUCCESS; } diff --git a/ompi/mca/osc/ucx/osc_ucx_comm.c b/ompi/mca/osc/ucx/osc_ucx_comm.c index ec760d4fda3..7f93ef42218 100644 --- a/ompi/mca/osc/ucx/osc_ucx_comm.c +++ b/ompi/mca/osc/ucx/osc_ucx_comm.c @@ -66,17 +66,19 @@ static inline int check_sync_state(ompi_osc_ucx_module_t *module, int target, return OMPI_SUCCESS; } -static inline int incr_and_check_ops_num(ompi_osc_ucx_module_t *module, int target, - ucp_ep_h ep) { - int status; +static inline int incr_and_check_ops_num(ompi_osc_ucx_module_t *module, int target) { + //int status; module->global_ops_num++; module->per_target_ops_nums[target]++; if (module->global_ops_num >= OSC_UCX_OPS_THRESHOLD) { - status = opal_common_ucx_ep_flush(ep, mca_osc_ucx_component.ucp_worker); + // TODO: + /*status =*/ opal_common_ucx_mem_flush(module->mem, OPAL_COMMON_UCX_SCOPE_EP, target); +/* if (status != OMPI_SUCCESS) { return status; } +*/ module->global_ops_num -= module->per_target_ops_nums[target]; module->per_target_ops_nums[target] = 0; } @@ -143,7 +145,7 @@ static inline int ddt_put_get(ompi_osc_ucx_module_t *module, ucx_iovec_t *origin_ucx_iov = NULL, *target_ucx_iov = NULL; uint32_t origin_ucx_iov_count = 0, target_ucx_iov_count = 0; uint32_t origin_ucx_iov_idx = 0, target_ucx_iov_idx = 0; - ucs_status_t status; + int status; int ret = OMPI_SUCCESS; if (!is_origin_contig) { @@ -167,24 +169,20 @@ static inline int ddt_put_get(ompi_osc_ucx_module_t *module, while (origin_ucx_iov_idx < origin_ucx_iov_count) { curr_len = MIN(origin_ucx_iov[origin_ucx_iov_idx].len, target_ucx_iov[target_ucx_iov_idx].len); - - if (!is_get) { - status = ucp_put_nbi(ep, origin_ucx_iov[origin_ucx_iov_idx].addr, curr_len, - remote_addr + (uint64_t)(target_ucx_iov[target_ucx_iov_idx].addr), rkey); - if (status != UCS_OK && status != UCS_INPROGRESS) { - OSC_UCX_VERBOSE(1, "ucp_put_nbi failed: %d", status); - return OMPI_ERROR; - } - } else { - status = ucp_get_nbi(ep, origin_ucx_iov[origin_ucx_iov_idx].addr, curr_len, - remote_addr + (uint64_t)(target_ucx_iov[target_ucx_iov_idx].addr), rkey); - if (status != UCS_OK && status != UCS_INPROGRESS) { - OSC_UCX_VERBOSE(1, "ucp_get_nbi failed: %d",status); - return OMPI_ERROR; - } + opal_common_ucx_op_t op = OPAL_COMMON_UCX_PUT; + if (is_get) { + op = OPAL_COMMON_UCX_GET; + } + status = opal_common_ucx_mem_putget(module->mem, op, + target, + origin_ucx_iov[origin_ucx_iov_idx].addr, curr_len, + remote_addr + (uint64_t)(target_ucx_iov[target_ucx_iov_idx].addr)); + if (OPAL_SUCCESS != status) { + OSC_UCX_VERBOSE(1, "ucp_put_nbi failed: %d", status); + return OMPI_ERROR; } - ret = incr_and_check_ops_num(module, target, ep); + ret = incr_and_check_ops_num(module, target); if (ret != OMPI_SUCCESS) { return ret; } @@ -206,6 +204,7 @@ static inline int ddt_put_get(ompi_osc_ucx_module_t *module, target_ucx_iov_idx == target_ucx_iov_count); } else if (!is_origin_contig) { + /* size_t prev_len = 0; while (origin_ucx_iov_idx < origin_ucx_iov_count) { if (!is_get) { @@ -234,7 +233,9 @@ static inline int ddt_put_get(ompi_osc_ucx_module_t *module, prev_len += origin_ucx_iov[origin_ucx_iov_idx].len; origin_ucx_iov_idx++; } + */ } else { + /* size_t prev_len = 0; while (target_ucx_iov_idx < target_ucx_iov_count) { if (!is_get) { @@ -263,6 +264,7 @@ static inline int ddt_put_get(ompi_osc_ucx_module_t *module, prev_len += target_ucx_iov[target_ucx_iov_idx].len; target_ucx_iov_idx++; } + */ } if (origin_ucx_iov != NULL) { @@ -277,19 +279,22 @@ static inline int ddt_put_get(ompi_osc_ucx_module_t *module, static inline int start_atomicity(ompi_osc_ucx_module_t *module, ucp_ep_h ep, int target) { uint64_t result_value = -1; - ucp_rkey_h rkey = (module->state_info_array)[target].rkey; - uint64_t remote_addr = (module->state_info_array)[target].addr + OSC_UCX_STATE_ACC_LOCK_OFFSET; - ucs_status_t status; + uint64_t remote_addr = (module->state_addrs)[target] + OSC_UCX_STATE_ACC_LOCK_OFFSET; +// ucs_status_t status; while (result_value != TARGET_LOCK_UNLOCKED) { - status = opal_common_ucx_atomic_cswap(ep, TARGET_LOCK_UNLOCKED, TARGET_LOCK_EXCLUSIVE, - &result_value, sizeof(result_value), - remote_addr, rkey, - mca_osc_ucx_component.ucp_worker); + // TODO: + /*status = */opal_common_ucx_mem_cmpswp(module->state_mem, + TARGET_LOCK_UNLOCKED, TARGET_LOCK_EXCLUSIVE, + target, + &result_value, sizeof(result_value), + remote_addr); +/* if (status != UCS_OK) { OSC_UCX_VERBOSE(1, "ucp_atomic_cswap64 failed: %d", status); return OMPI_ERROR; } +*/ } return OMPI_SUCCESS; @@ -297,16 +302,21 @@ static inline int start_atomicity(ompi_osc_ucx_module_t *module, ucp_ep_h ep, in static inline int end_atomicity(ompi_osc_ucx_module_t *module, ucp_ep_h ep, int target) { uint64_t result_value = 0; - ucp_rkey_h rkey = (module->state_info_array)[target].rkey; - uint64_t remote_addr = (module->state_info_array)[target].addr + OSC_UCX_STATE_ACC_LOCK_OFFSET; - int ret; - - ret = opal_common_ucx_atomic_fetch(ep, UCP_ATOMIC_FETCH_OP_SWAP, TARGET_LOCK_UNLOCKED, - &result_value, sizeof(result_value), - remote_addr, rkey, mca_osc_ucx_component.ucp_worker); - if (OMPI_SUCCESS != ret) { - return ret; + uint64_t remote_addr = (module->state_addrs)[target] + OSC_UCX_STATE_ACC_LOCK_OFFSET; +// int ret; + + // TODO: + /*status = */opal_common_ucx_mem_fetch(module->state_mem, + UCP_ATOMIC_FETCH_OP_SWAP, TARGET_LOCK_UNLOCKED, + target, + &result_value, sizeof(result_value), + remote_addr); +/* + if (status != UCS_OK) { + OSC_UCX_VERBOSE(1, "ucp_atomic_cswap64 failed: %d", status); + return OMPI_ERROR; } +*/ assert(result_value == TARGET_LOCK_EXCLUSIVE); @@ -315,6 +325,7 @@ static inline int end_atomicity(ompi_osc_ucx_module_t *module, ucp_ep_h ep, int static inline int get_dynamic_win_info(uint64_t remote_addr, ompi_osc_ucx_module_t *module, ucp_ep_h ep, int target) { +/* ucp_rkey_h state_rkey = (module->state_info_array)[target].rkey; uint64_t remote_state_addr = (module->state_info_array)[target].addr + OSC_UCX_STATE_DYNAMIC_WIN_CNT_OFFSET; size_t len = sizeof(uint64_t) + sizeof(ompi_osc_dynamic_win_info_t) * OMPI_OSC_UCX_ATTACH_MAX; @@ -361,40 +372,36 @@ static inline int get_dynamic_win_info(uint64_t remote_addr, ompi_osc_ucx_module free(temp_buf); return status; + */ + return OMPI_SUCCESS; } int ompi_osc_ucx_put(const void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt, int target, ptrdiff_t target_disp, int target_count, struct ompi_datatype_t *target_dt, struct ompi_win_t *win) { ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module; - ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target); - uint64_t remote_addr = (module->win_info_array[target]).addr + target_disp * OSC_UCX_GET_DISP(module, target); - ucp_rkey_h rkey; + uint64_t remote_addr = (module->addrs[target]) + target_disp * OSC_UCX_GET_DISP(module, target); bool is_origin_contig = false, is_target_contig = false; ptrdiff_t origin_lb, origin_extent, target_lb, target_extent; - ucs_status_t status; + /*ucs_status_t*/ int status; int ret = OMPI_SUCCESS; ret = check_sync_state(module, target, false); if (ret != OMPI_SUCCESS) { return ret; } - +/* if (module->flavor == MPI_WIN_FLAVOR_DYNAMIC) { status = get_dynamic_win_info(remote_addr, module, ep, target); if (status != UCS_OK) { return OMPI_ERROR; } } - - CHECK_VALID_RKEY(module, target, target_count); - +*/ if (!target_count) { return OMPI_SUCCESS; } - rkey = (module->win_info_array[target]).rkey; - ompi_datatype_get_true_extent(origin_dt, &origin_lb, &origin_extent); ompi_datatype_get_true_extent(target_dt, &target_lb, &target_extent); @@ -408,16 +415,18 @@ int ompi_osc_ucx_put(const void *origin_addr, int origin_count, struct ompi_data ompi_datatype_type_size(origin_dt, &origin_len); origin_len *= origin_count; - status = ucp_put_nbi(ep, (void *)((intptr_t)origin_addr + origin_lb), origin_len, - remote_addr + target_lb, rkey); - if (status != UCS_OK && status != UCS_INPROGRESS) { + status = opal_common_ucx_mem_putget(module->mem, OPAL_COMMON_UCX_PUT, + target, + (void *)((intptr_t)origin_addr + origin_lb), + origin_len, remote_addr + target_lb); + if (OPAL_SUCCESS != status) { OSC_UCX_VERBOSE(1, "ucp_put_nbi failed: %d", status); return OMPI_ERROR; } - return incr_and_check_ops_num(module, target, ep); + return incr_and_check_ops_num(module, target); } else { return ddt_put_get(module, origin_addr, origin_count, origin_dt, is_origin_contig, - origin_lb, target, ep, remote_addr, rkey, target_count, target_dt, + origin_lb, target, NULL, remote_addr, NULL, target_count, target_dt, is_target_contig, target_lb, false); } } @@ -426,6 +435,7 @@ int ompi_osc_ucx_get(void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt, int target, ptrdiff_t target_disp, int target_count, struct ompi_datatype_t *target_dt, struct ompi_win_t *win) { +/* ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module; ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target); uint64_t remote_addr = (module->win_info_array[target]).addr + target_disp * OSC_UCX_GET_DISP(module, target); @@ -447,13 +457,13 @@ int ompi_osc_ucx_get(void *origin_addr, int origin_count, } } - CHECK_VALID_RKEY(module, target, target_count); + //CHECK_VALID_RKEY(module, target, target_count); if (!target_count) { return OMPI_SUCCESS; } - rkey = (module->win_info_array[target]).rkey; +// rkey = (module->win_info_array[target]).rkey; ompi_datatype_get_true_extent(origin_dt, &origin_lb, &origin_extent); ompi_datatype_get_true_extent(target_dt, &target_lb, &target_extent); @@ -462,7 +472,7 @@ int ompi_osc_ucx_get(void *origin_addr, int origin_count, is_target_contig = ompi_datatype_is_contiguous_memory_layout(target_dt, target_count); if (is_origin_contig && is_target_contig) { - /* fast path */ + // fast path size_t origin_len; ompi_datatype_type_size(origin_dt, &origin_len); @@ -475,12 +485,14 @@ int ompi_osc_ucx_get(void *origin_addr, int origin_count, return OMPI_ERROR; } - return incr_and_check_ops_num(module, target, ep); + return incr_and_check_ops_num(module, target); } else { return ddt_put_get(module, origin_addr, origin_count, origin_dt, is_origin_contig, origin_lb, target, ep, remote_addr, rkey, target_count, target_dt, is_target_contig, target_lb, true); } +*/ + return OMPI_SUCCESS; } int ompi_osc_ucx_accumulate(const void *origin_addr, int origin_count, @@ -488,6 +500,7 @@ int ompi_osc_ucx_accumulate(const void *origin_addr, int origin_count, int target, ptrdiff_t target_disp, int target_count, struct ompi_datatype_t *target_dt, struct ompi_op_t *op, struct ompi_win_t *win) { +/* ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module; ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target); int ret = OMPI_SUCCESS; @@ -606,12 +619,15 @@ int ompi_osc_ucx_accumulate(const void *origin_addr, int origin_count, ret = end_atomicity(module, ep, target); return ret; + */ + return OMPI_SUCCESS; } int ompi_osc_ucx_compare_and_swap(const void *origin_addr, const void *compare_addr, void *result_addr, struct ompi_datatype_t *dt, int target, ptrdiff_t target_disp, struct ompi_win_t *win) { +/* ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t *)win->w_osc_module; ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target); uint64_t remote_addr = (module->win_info_array[target]).addr + target_disp * OSC_UCX_GET_DISP(module, target); @@ -648,18 +664,22 @@ int ompi_osc_ucx_compare_and_swap(const void *origin_addr, const void *compare_a ucp_request_release(req); } - ret = incr_and_check_ops_num(module, target, ep); + ret = incr_and_check_ops_num(module, target); if (ret != OMPI_SUCCESS) { return ret; } return end_atomicity(module, ep, target); + */ + return OMPI_SUCCESS; } int ompi_osc_ucx_fetch_and_op(const void *origin_addr, void *result_addr, struct ompi_datatype_t *dt, int target, ptrdiff_t target_disp, struct ompi_op_t *op, struct ompi_win_t *win) { + +/* ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module; int ret = OMPI_SUCCESS; @@ -710,7 +730,7 @@ int ompi_osc_ucx_fetch_and_op(const void *origin_addr, void *result_addr, ucp_request_release(req); } - ret = incr_and_check_ops_num(module, target, ep); + ret = incr_and_check_ops_num(module, target); if (ret != OMPI_SUCCESS) { return ret; } @@ -720,6 +740,8 @@ int ompi_osc_ucx_fetch_and_op(const void *origin_addr, void *result_addr, return ompi_osc_ucx_get_accumulate(origin_addr, 1, dt, result_addr, 1, dt, target, target_disp, 1, dt, op, win); } + */ + return OMPI_SUCCESS; } int ompi_osc_ucx_get_accumulate(const void *origin_addr, int origin_count, @@ -729,6 +751,8 @@ int ompi_osc_ucx_get_accumulate(const void *origin_addr, int origin_count, int target, ptrdiff_t target_disp, int target_count, struct ompi_datatype_t *target_dt, struct ompi_op_t *op, struct ompi_win_t *win) { +/* + ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module; ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target); int ret = OMPI_SUCCESS; @@ -851,6 +875,8 @@ int ompi_osc_ucx_get_accumulate(const void *origin_addr, int origin_count, ret = end_atomicity(module, ep, target); return ret; + */ + return OMPI_SUCCESS; } int ompi_osc_ucx_rput(const void *origin_addr, int origin_count, @@ -858,6 +884,8 @@ int ompi_osc_ucx_rput(const void *origin_addr, int origin_count, int target, ptrdiff_t target_disp, int target_count, struct ompi_datatype_t *target_dt, struct ompi_win_t *win, struct ompi_request_t **request) { + + /* ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module; ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target); uint64_t remote_addr = (module->state_info_array[target]).addr + OSC_UCX_STATE_REQ_FLAG_OFFSET; @@ -911,7 +939,9 @@ int ompi_osc_ucx_rput(const void *origin_addr, int origin_count, *request = &ucx_req->super; - return incr_and_check_ops_num(module, target, ep); + return incr_and_check_ops_num(module, target); + */ + return OMPI_SUCCESS; } int ompi_osc_ucx_rget(void *origin_addr, int origin_count, @@ -919,6 +949,7 @@ int ompi_osc_ucx_rget(void *origin_addr, int origin_count, int target, ptrdiff_t target_disp, int target_count, struct ompi_datatype_t *target_dt, struct ompi_win_t *win, struct ompi_request_t **request) { +/* ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module; ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target); uint64_t remote_addr = (module->state_info_array[target]).addr + OSC_UCX_STATE_REQ_FLAG_OFFSET; @@ -972,7 +1003,9 @@ int ompi_osc_ucx_rget(void *origin_addr, int origin_count, *request = &ucx_req->super; - return incr_and_check_ops_num(module, target, ep); + return incr_and_check_ops_num(module, target); + */ + return OMPI_SUCCESS; } int ompi_osc_ucx_raccumulate(const void *origin_addr, int origin_count, @@ -980,6 +1013,7 @@ int ompi_osc_ucx_raccumulate(const void *origin_addr, int origin_count, int target, ptrdiff_t target_disp, int target_count, struct ompi_datatype_t *target_dt, struct ompi_op_t *op, struct ompi_win_t *win, struct ompi_request_t **request) { +/* ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module; ompi_osc_ucx_request_t *ucx_req = NULL; int ret = OMPI_SUCCESS; @@ -1002,6 +1036,8 @@ int ompi_osc_ucx_raccumulate(const void *origin_addr, int origin_count, *request = &ucx_req->super; return ret; + */ + return OMPI_SUCCESS; } int ompi_osc_ucx_rget_accumulate(const void *origin_addr, int origin_count, @@ -1012,6 +1048,7 @@ int ompi_osc_ucx_rget_accumulate(const void *origin_addr, int origin_count, struct ompi_datatype_t *target_datatype, struct ompi_op_t *op, struct ompi_win_t *win, struct ompi_request_t **request) { +/* ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module; ompi_osc_ucx_request_t *ucx_req = NULL; int ret = OMPI_SUCCESS; @@ -1037,4 +1074,6 @@ int ompi_osc_ucx_rget_accumulate(const void *origin_addr, int origin_count, *request = &ucx_req->super; return ret; + */ + return OMPI_SUCCESS; } diff --git a/ompi/mca/osc/ucx/osc_ucx_component.c b/ompi/mca/osc/ucx/osc_ucx_component.c index 02f975dbf24..0bcbd9d35d4 100644 --- a/ompi/mca/osc/ucx/osc_ucx_component.c +++ b/ompi/mca/osc/ucx/osc_ucx_component.c @@ -199,7 +199,7 @@ static int component_select(struct ompi_win_t *win, void **base, size_t size, in char *name = NULL; long values[2]; int ret = OMPI_SUCCESS; - ucs_status_t status; + //ucs_status_t status; int i, comm_size = ompi_comm_size(comm); bool env_initialized = false; void *state_base = NULL; @@ -305,7 +305,7 @@ static int component_select(struct ompi_win_t *win, void **base, size_t size, in } if (flavor == MPI_WIN_FLAVOR_ALLOCATE || flavor == MPI_WIN_FLAVOR_CREATE) { - swtich (flavor) { + switch (flavor) { case MPI_WIN_FLAVOR_ALLOCATE: mem_type = OPAL_COMMON_UCX_MEM_ALLOCATE_MAP; break; @@ -344,7 +344,7 @@ static int component_select(struct ompi_win_t *win, void **base, size_t size, in } else { memcpy(my_info, &zero, sizeof(uint64_t)); } - memcpy(my_info + sizeof(uint64_t), &state_base, sizeof(uint64_t)); + memcpy((char*)my_info + sizeof(uint64_t), &state_base, sizeof(uint64_t)); recv_buf = (char *)calloc(comm_size, 2 * sizeof(uint64_t)); ret = comm->c_coll->coll_allgather((void *)&my_info, 2 * sizeof(uint64_t), @@ -451,6 +451,7 @@ int ompi_osc_find_attached_region_position(ompi_osc_dynamic_win_info_t *dynamic_ } int ompi_osc_ucx_win_attach(struct ompi_win_t *win, void *base, size_t len) { +/* ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module; int insert_index = -1, contain_index; void *rkey_buffer; @@ -482,7 +483,7 @@ int ompi_osc_ucx_win_attach(struct ompi_win_t *win, void *base, size_t len) { } else { insert_index = 0; } - +/* ret = mem_map(&base, len, &(module->local_dynamic_win_info[insert_index].memh), module, MPI_WIN_FLAVOR_CREATE); if (ret != OMPI_SUCCESS) { @@ -510,9 +511,12 @@ int ompi_osc_ucx_win_attach(struct ompi_win_t *win, void *base, size_t len) { ucp_rkey_buffer_release(rkey_buffer); return ret; + */ + return OMPI_SUCCESS; } int ompi_osc_ucx_win_detach(struct ompi_win_t *win, const void *base) { +/* ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module; int insert, contain; @@ -523,7 +527,7 @@ int ompi_osc_ucx_win_detach(struct ompi_win_t *win, const void *base) { (uint64_t)base, 1, &insert); assert(contain >= 0 && (uint64_t)contain < module->state.dynamic_win_count); - /* if we can't find region - just exit */ + // if we can't find region - just exit if (contain < 0) { return OMPI_SUCCESS; } @@ -543,6 +547,8 @@ int ompi_osc_ucx_win_detach(struct ompi_win_t *win, const void *base) { } return OMPI_SUCCESS; + */ + return OMPI_SUCCESS; } int ompi_osc_ucx_free(struct ompi_win_t *win) { @@ -555,8 +561,9 @@ int ompi_osc_ucx_free(struct ompi_win_t *win) { OBJ_DESTRUCT(&module->outstanding_locks); OBJ_DESTRUCT(&module->pending_posts); + /* while (module->state.lock != TARGET_LOCK_UNLOCKED) { - /* not sure if this is required */ + // not sure if this is required ucp_worker_progress(mca_osc_ucx_component.ucp_worker); } @@ -564,10 +571,10 @@ int ompi_osc_ucx_free(struct ompi_win_t *win) { if (OMPI_SUCCESS != ret) { OSC_UCX_VERBOSE(1, "opal_common_ucx_worker_flush failed: %d", ret); } - +*/ ret = module->comm->c_coll->coll_barrier(module->comm, module->comm->c_coll->coll_barrier_module); - +/* for (i = 0; i < ompi_comm_size(module->comm); i++) { if ((module->win_info_array[i]).rkey_init == true) { ucp_rkey_destroy((module->win_info_array[i]).rkey); @@ -593,4 +600,6 @@ int ompi_osc_ucx_free(struct ompi_win_t *win) { ompi_osc_ucx_unregister_progress(); return ret; + */ + return OMPI_SUCCESS; } diff --git a/ompi/mca/osc/ucx/osc_ucx_passive_target.c b/ompi/mca/osc/ucx/osc_ucx_passive_target.c index 3a7ad3e9e24..32349f55502 100644 --- a/ompi/mca/osc/ucx/osc_ucx_passive_target.c +++ b/ompi/mca/osc/ucx/osc_ucx_passive_target.c @@ -20,27 +20,32 @@ OBJ_CLASS_INSTANCE(ompi_osc_ucx_lock_t, opal_object_t, NULL, NULL); static inline int start_shared(ompi_osc_ucx_module_t *module, int target) { uint64_t result_value = -1; - ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target); - ucp_rkey_h rkey = (module->state_info_array)[target].rkey; - uint64_t remote_addr = (module->state_info_array)[target].addr + OSC_UCX_STATE_LOCK_OFFSET; - ucs_status_t status; - int ret; + uint64_t remote_addr = (module->state_addrs)[target] + OSC_UCX_STATE_LOCK_OFFSET; +// ucs_status_t status; +// int ret; while (true) { - ret = opal_common_ucx_atomic_fetch(ep, UCP_ATOMIC_FETCH_OP_FADD, 1, - &result_value, sizeof(result_value), - remote_addr, rkey, mca_osc_ucx_component.ucp_worker); - if (OMPI_SUCCESS != ret) { - return ret; - } + opal_common_ucx_mem_fetch(module->state_mem, UCP_ATOMIC_FETCH_OP_FADD, 1, + target, + &result_value, sizeof(result_value), + remote_addr); + +// if (OMPI_SUCCESS != ret) { +// return ret; +// } + assert((int64_t)result_value >= 0); if (result_value >= TARGET_LOCK_EXCLUSIVE) { - status = ucp_atomic_post(ep, UCP_ATOMIC_POST_OP_ADD, (-1), sizeof(uint64_t), - remote_addr, rkey); - if (status != UCS_OK) { - OSC_UCX_VERBOSE(1, "ucp_atomic_add64 failed: %d", status); - return OMPI_ERROR; - } +// status = ucp_atomic_post(ep, UCP_ATOMIC_POST_OP_ADD, (-1), sizeof(uint64_t), +// remote_addr, rkey); + opal_common_ucx_mem_post(module->state_mem, + UCP_ATOMIC_POST_OP_ADD, (-1), target, + sizeof(uint64_t), + remote_addr); +// if (status != UCS_OK) { +// OSC_UCX_VERBOSE(1, "ucp_atomic_add64 failed: %d", status); +// return OMPI_ERROR; +// } } else { break; } @@ -50,36 +55,45 @@ static inline int start_shared(ompi_osc_ucx_module_t *module, int target) { } static inline int end_shared(ompi_osc_ucx_module_t *module, int target) { - ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target); - ucp_rkey_h rkey = (module->state_info_array)[target].rkey; - uint64_t remote_addr = (module->state_info_array)[target].addr + OSC_UCX_STATE_LOCK_OFFSET; - ucs_status_t status; - - status = ucp_atomic_post(ep, UCP_ATOMIC_POST_OP_ADD, (-1), sizeof(uint64_t), - remote_addr, rkey); - if (status != UCS_OK) { - OSC_UCX_VERBOSE(1, "ucp_atomic_post(OP_ADD) failed: %d", status); - return OMPI_ERROR; - } +// ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target); +// ucp_rkey_h rkey = (module->state_info_array)[target].rkey; + uint64_t remote_addr = (module->state_addrs)[target] + OSC_UCX_STATE_LOCK_OFFSET; +// ucs_status_t status; + +// status = ucp_atomic_post(ep, UCP_ATOMIC_POST_OP_ADD, (-1), sizeof(uint64_t), +// remote_addr, rkey); + opal_common_ucx_mem_post(module->state_mem, + UCP_ATOMIC_POST_OP_ADD, (-1), target, sizeof(uint64_t), + remote_addr); +// if (status != UCS_OK) { +// OSC_UCX_VERBOSE(1, "ucp_atomic_post(OP_ADD) failed: %d", status); +// return OMPI_ERROR; +// } return OMPI_SUCCESS; } static inline int start_exclusive(ompi_osc_ucx_module_t *module, int target) { uint64_t result_value = -1; - ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target); - ucp_rkey_h rkey = (module->state_info_array)[target].rkey; - uint64_t remote_addr = (module->state_info_array)[target].addr + OSC_UCX_STATE_LOCK_OFFSET; - ucs_status_t status; +// ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target); +// ucp_rkey_h rkey = (module->state_info_array)[target].rkey; + uint64_t remote_addr = (module->state_addrs)[target] + OSC_UCX_STATE_LOCK_OFFSET; +// ucs_status_t status; while (result_value != TARGET_LOCK_UNLOCKED) { - status = opal_common_ucx_atomic_cswap(ep, TARGET_LOCK_UNLOCKED, TARGET_LOCK_EXCLUSIVE, - &result_value, sizeof(result_value), - remote_addr, rkey, - mca_osc_ucx_component.ucp_worker); - if (status != UCS_OK) { - return OMPI_ERROR; - } +// status = opal_common_ucx_atomic_cswap(ep, TARGET_LOCK_UNLOCKED, TARGET_LOCK_EXCLUSIVE, +// &result_value, sizeof(result_value), +// remote_addr, rkey, +// mca_osc_ucx_component.ucp_worker); + opal_common_ucx_mem_cmpswp(module->state_mem, + TARGET_LOCK_UNLOCKED, TARGET_LOCK_EXCLUSIVE, + target, + &result_value, sizeof(result_value), + remote_addr); + +// if (status != UCS_OK) { +// return OMPI_ERROR; +// } } return OMPI_SUCCESS; @@ -87,17 +101,23 @@ static inline int start_exclusive(ompi_osc_ucx_module_t *module, int target) { static inline int end_exclusive(ompi_osc_ucx_module_t *module, int target) { uint64_t result_value = 0; - ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target); - ucp_rkey_h rkey = (module->state_info_array)[target].rkey; - uint64_t remote_addr = (module->state_info_array)[target].addr + OSC_UCX_STATE_LOCK_OFFSET; - int ret; - - ret = opal_common_ucx_atomic_fetch(ep, UCP_ATOMIC_FETCH_OP_SWAP, TARGET_LOCK_UNLOCKED, - &result_value, sizeof(result_value), - remote_addr, rkey, mca_osc_ucx_component.ucp_worker); - if (OMPI_SUCCESS != ret) { - return ret; - } +// ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target); +// ucp_rkey_h rkey = (module->state_info_array)[target].rkey; + uint64_t remote_addr = (module->state_addrs)[target] + OSC_UCX_STATE_LOCK_OFFSET; +// int ret; + +// ret = opal_common_ucx_atomic_fetch(ep, UCP_ATOMIC_FETCH_OP_SWAP, TARGET_LOCK_UNLOCKED, +// &result_value, sizeof(result_value), +// remote_addr, rkey, mca_osc_ucx_component.ucp_worker); + + opal_common_ucx_mem_fetch(module->state_mem, + UCP_ATOMIC_FETCH_OP_SWAP, TARGET_LOCK_UNLOCKED, + target, + &result_value, sizeof(result_value), + remote_addr); +// if (OMPI_SUCCESS != ret) { +// return ret; +// } assert(result_value >= TARGET_LOCK_EXCLUSIVE); @@ -158,7 +178,7 @@ int ompi_osc_ucx_unlock(int target, struct ompi_win_t *win) { ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t *)win->w_osc_module; ompi_osc_ucx_lock_t *lock = NULL; int ret = OMPI_SUCCESS; - ucp_ep_h ep; +// ucp_ep_h ep; if (module->epoch_type.access != PASSIVE_EPOCH) { return OMPI_ERR_RMA_SYNC; @@ -172,8 +192,9 @@ int ompi_osc_ucx_unlock(int target, struct ompi_win_t *win) { opal_hash_table_remove_value_uint32(&module->outstanding_locks, (uint32_t)target); - ep = OSC_UCX_GET_EP(module->comm, target); - ret = opal_common_ucx_ep_flush(ep, mca_osc_ucx_component.ucp_worker); +// ep = OSC_UCX_GET_EP(module->comm, target); +// ret = opal_common_ucx_ep_flush(ep, mca_osc_ucx_component.ucp_worker); + opal_common_ucx_mem_flush(module->mem, OPAL_COMMON_UCX_SCOPE_EP, target); if (ret != OMPI_SUCCESS) { return ret; } @@ -244,10 +265,11 @@ int ompi_osc_ucx_unlock_all(struct ompi_win_t *win) { assert(module->lock_count == 0); - ret = opal_common_ucx_worker_flush(mca_osc_ucx_component.ucp_worker); - if (ret != OMPI_SUCCESS) { - return ret; - } + //ret = opal_common_ucx_worker_flush(mca_osc_ucx_component.ucp_worker); + opal_common_ucx_mem_flush(module->mem, OPAL_COMMON_UCX_SCOPE_WORKER, 0); +// if (ret != OMPI_SUCCESS) { +// return ret; +// } module->global_ops_num = 0; memset(module->per_target_ops_nums, 0, sizeof(int) * comm_size); @@ -265,40 +287,42 @@ int ompi_osc_ucx_unlock_all(struct ompi_win_t *win) { } int ompi_osc_ucx_sync(struct ompi_win_t *win) { - ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t *)win->w_osc_module; - ucs_status_t status; - if (module->epoch_type.access != PASSIVE_EPOCH && - module->epoch_type.access != PASSIVE_ALL_EPOCH) { - return OMPI_ERR_RMA_SYNC; - } +// ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t *)win->w_osc_module; +// ucs_status_t status; - opal_atomic_mb(); +// if (module->epoch_type.access != PASSIVE_EPOCH && +// module->epoch_type.access != PASSIVE_ALL_EPOCH) { +// return OMPI_ERR_RMA_SYNC; +// } - status = ucp_worker_fence(mca_osc_ucx_component.ucp_worker); - if (status != UCS_OK) { - OSC_UCX_VERBOSE(1, "ucp_worker_fence failed: %d", status); - return OMPI_ERROR; - } +// opal_atomic_mb(); + +// status = ucp_worker_fence(mca_osc_ucx_component.ucp_worker); +// if (status != UCS_OK) { +// OSC_UCX_VERBOSE(1, "ucp_worker_fence failed: %d", status); +// return OMPI_ERROR; +// } return OMPI_SUCCESS; } int ompi_osc_ucx_flush(int target, struct ompi_win_t *win) { ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module; - ucp_ep_h ep; - int ret; +// ucp_ep_h ep; +// int ret; if (module->epoch_type.access != PASSIVE_EPOCH && module->epoch_type.access != PASSIVE_ALL_EPOCH) { return OMPI_ERR_RMA_SYNC; } - ep = OSC_UCX_GET_EP(module->comm, target); - ret = opal_common_ucx_ep_flush(ep, mca_osc_ucx_component.ucp_worker); - if (ret != OMPI_SUCCESS) { - return ret; - } +// ep = OSC_UCX_GET_EP(module->comm, target); +// ret = opal_common_ucx_ep_flush(ep, mca_osc_ucx_component.ucp_worker); + opal_common_ucx_mem_flush(module->mem, OPAL_COMMON_UCX_SCOPE_EP, target); +// if (ret != OMPI_SUCCESS) { +// return ret; +// } module->global_ops_num -= module->per_target_ops_nums[target]; module->per_target_ops_nums[target] = 0; @@ -308,17 +332,19 @@ int ompi_osc_ucx_flush(int target, struct ompi_win_t *win) { int ompi_osc_ucx_flush_all(struct ompi_win_t *win) { ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t *)win->w_osc_module; - int ret; +// int ret; if (module->epoch_type.access != PASSIVE_EPOCH && module->epoch_type.access != PASSIVE_ALL_EPOCH) { return OMPI_ERR_RMA_SYNC; } - ret = opal_common_ucx_worker_flush(mca_osc_ucx_component.ucp_worker); - if (ret != OMPI_SUCCESS) { - return ret; - } +// ret = opal_common_ucx_worker_flush(mca_osc_ucx_component.ucp_worker); + opal_common_ucx_mem_flush(module->mem, OPAL_COMMON_UCX_SCOPE_WORKER, 0); + +// if (ret != OMPI_SUCCESS) { +// return ret; +// } module->global_ops_num = 0; memset(module->per_target_ops_nums, 0, diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index 98e987d961e..4f02325b993 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -99,9 +99,7 @@ opal_common_ucx_mem_op(opal_common_ucx_mem_t *mem, int target, void *buffer, size_t len, uint64_t rem_addr); -OPAL_DECLSPEC void opal_common_ucx_mem_flush(opal_common_ucx_mem_t *mem, - opal_common_ucx_flush_scope_t scope, - int target); + static int _tlocal_tls_ctxtbl_extend(_tlocal_table_t *tbl, size_t append); static int _tlocal_tls_memtbl_extend(_tlocal_table_t *tbl, size_t append); @@ -1118,13 +1116,9 @@ static int _tlocal_mem_create_rkey(_tlocal_mem_t *mem_rec, ucp_ep_h ep, int targ return OPAL_SUCCESS; } - - -OPAL_DECLSPEC int opal_common_ucx_mem_op(opal_common_ucx_mem_t *mem, - opal_common_ucx_op_t op, - int target, - void *buffer, size_t len, - uint64_t rem_addr) +static inline int _tlocal_fetch(opal_common_ucx_mem_t *mem, int target, + ucp_ep_h *_ep, ucp_rkey_h *_rkey, + _worker_info_t **_winfo) { _tlocal_table_t *tls = NULL; _tlocal_ctx_t *ctx_rec; @@ -1133,7 +1127,6 @@ OPAL_DECLSPEC int opal_common_ucx_mem_op(opal_common_ucx_mem_t *mem, _mem_info_t *mem_info; ucp_ep_h ep; ucp_rkey_h rkey; - ucs_status_t status; int rc; tls = _tlocal_get_tls(mem->ctx->wpool); @@ -1143,6 +1136,7 @@ OPAL_DECLSPEC int opal_common_ucx_mem_op(opal_common_ucx_mem_t *mem, ctx_rec = _tlocal_add_ctx(tls, mem->ctx); if (NULL == ctx_rec) { // TODO: err handling + return OPAL_ERR_OUT_OF_RESOURCE; } } winfo = ctx_rec->winfo; @@ -1162,6 +1156,7 @@ OPAL_DECLSPEC int opal_common_ucx_mem_op(opal_common_ucx_mem_t *mem, mem_rec = _tlocal_add_mem(tls, mem); if (NULL == mem_rec) { // TODO: err handling + return OPAL_ERR_OUT_OF_RESOURCE; } } mem_info = mem_rec->mem; @@ -1174,9 +1169,34 @@ OPAL_DECLSPEC int opal_common_ucx_mem_op(opal_common_ucx_mem_t *mem, return rc; } } - rkey = mem_info->rkeys[target]; - /* Perform the operation */ + *_ep = ep; + *_rkey = rkey = mem_info->rkeys[target]; + *_winfo = winfo; + return OPAL_SUCCESS; +} + + + +OPAL_DECLSPEC int opal_common_ucx_mem_putget(opal_common_ucx_mem_t *mem, + opal_common_ucx_op_t op, + int target, + void *buffer, size_t len, + uint64_t rem_addr) +{ + ucp_ep_h ep; + ucp_rkey_h rkey; + ucs_status_t status; + _worker_info_t *winfo; + int rc; + + rc =_tlocal_fetch(mem, target, &ep, &rkey, &winfo); + if( rc ){ + // TODO: err handling + return rc; + } + + /* Perform the operation */ opal_mutex_lock(&winfo->mutex); switch(op){ case OPAL_COMMON_UCX_GET: @@ -1200,6 +1220,109 @@ OPAL_DECLSPEC int opal_common_ucx_mem_op(opal_common_ucx_mem_t *mem, return OPAL_SUCCESS; } + +OPAL_DECLSPEC int opal_common_ucx_mem_cmpswp(opal_common_ucx_mem_t *mem, + uint64_t compare, uint64_t value, + int target, + void *buffer, size_t len, + uint64_t rem_addr) +{ + ucp_ep_h ep; + ucp_rkey_h rkey; + _worker_info_t *winfo; + ucs_status_t status; + int rc; + + rc =_tlocal_fetch(mem, target, &ep, &rkey, &winfo); + if( rc ){ + // TODO: err handling + return rc; + } + + /* Perform the operation */ + opal_mutex_lock(&winfo->mutex); + status = opal_common_ucx_atomic_cswap(ep, compare, value, + buffer, len, + rem_addr, rkey, + winfo->worker); + if (status != UCS_OK) { + // TODO: OSC_UCX_VERBOSE(1, "ucp_atomic_cswap64 failed: %d", status); + return OPAL_ERROR; + } + opal_mutex_unlock(&winfo->mutex); + return OPAL_SUCCESS; +} + +OPAL_DECLSPEC int opal_common_ucx_mem_fetch(opal_common_ucx_mem_t *mem, + ucp_atomic_fetch_op_t opcode, uint64_t value, + int target, + void *buffer, size_t len, + uint64_t rem_addr) +{ + ucp_ep_h ep; + ucp_rkey_h rkey; + _worker_info_t *winfo; + ucs_status_t status; + int rc; + + rc =_tlocal_fetch(mem, target, &ep, &rkey, &winfo); + if( rc ){ + // TODO: err handling + return rc; + } + + /* Perform the operation */ + opal_mutex_lock(&winfo->mutex); + status = opal_common_ucx_atomic_fetch(ep, opcode, value, + buffer, len, + rem_addr, rkey, + winfo->worker); + if (status != UCS_OK) { + // TODO: OSC_UCX_VERBOSE(1, "ucp_atomic_cswap64 failed: %d", status); + return OPAL_ERROR; + } + opal_mutex_unlock(&winfo->mutex); + return OPAL_SUCCESS; +} + +//ucs_status_t ucp_atomic_post(ucp_ep_h ep, ucp_atomic_post_op_t opcode, uint64_t value, +// size_t op_size, uint64_t remote_addr, ucp_rkey_h rkey); + + +OPAL_DECLSPEC int opal_common_ucx_mem_post(opal_common_ucx_mem_t *mem, + ucp_atomic_post_op_t opcode, + uint64_t value, + int target, + size_t len, + uint64_t rem_addr) +{ + ucp_ep_h ep; + ucp_rkey_h rkey; + _worker_info_t *winfo; + ucs_status_t status; + int rc; + + rc =_tlocal_fetch(mem, target, &ep, &rkey, &winfo); + if( rc ){ + // TODO: err handling + return rc; + } + + /* Perform the operation */ + opal_mutex_lock(&winfo->mutex); + status = ucp_atomic_post(ep, opcode, value, + len, + rem_addr, rkey); + if (status != UCS_OK) { + // TODO: OSC_UCX_VERBOSE(1, "ucp_atomic_cswap64 failed: %d", status); + return OPAL_ERROR; + } + opal_mutex_unlock(&winfo->mutex); + return OPAL_SUCCESS; +} + + +// TODO: return sttaus OPAL_DECLSPEC void opal_common_ucx_mem_flush(opal_common_ucx_mem_t *mem, opal_common_ucx_flush_scope_t scope, @@ -1229,4 +1352,5 @@ opal_common_ucx_mem_flush(opal_common_ucx_mem_t *mem, OPAL_DECLSPEC int opal_common_ucx_workers_progress(opal_common_ucx_wpool_t *wpool) { // TODO + return 0; } diff --git a/opal/mca/common/ucx/common_ucx.h b/opal/mca/common/ucx/common_ucx.h index 230cd7aa8ad..fac588390d7 100644 --- a/opal/mca/common/ucx/common_ucx.h +++ b/opal/mca/common/ucx/common_ucx.h @@ -165,7 +165,32 @@ OPAL_DECLSPEC int opal_common_ucx_mem_create(opal_common_ucx_ctx_t *ctx, int com opal_common_ucx_exchange_func_t exchange_func, void *exchange_metadata, opal_common_ucx_mem_t **mem_ptr); +OPAL_DECLSPEC void opal_common_ucx_mem_flush(opal_common_ucx_mem_t *mem, + opal_common_ucx_flush_scope_t scope, + int target); OPAL_DECLSPEC int opal_common_ucx_workers_progress(opal_common_ucx_wpool_t *wpool); +OPAL_DECLSPEC int opal_common_ucx_mem_cmpswp(opal_common_ucx_mem_t *mem, + uint64_t compare, uint64_t value, + int target, + void *buffer, size_t len, + uint64_t rem_addr); +OPAL_DECLSPEC int opal_common_ucx_mem_putget(opal_common_ucx_mem_t *mem, + opal_common_ucx_op_t op, + int target, + void *buffer, size_t len, + uint64_t rem_addr); +OPAL_DECLSPEC int opal_common_ucx_mem_fetch(opal_common_ucx_mem_t *mem, + ucp_atomic_fetch_op_t opcode, uint64_t value, + int target, + void *buffer, size_t len, + uint64_t rem_addr); +OPAL_DECLSPEC int opal_common_ucx_mem_post(opal_common_ucx_mem_t *mem, + ucp_atomic_post_op_t opcode, + uint64_t value, + int target, + size_t len, + uint64_t rem_addr); + OPAL_DECLSPEC void opal_common_ucx_mca_register(void); From 4c10a49748767fadf3c713dcf5d6e70aa0baf1d0 Mon Sep 17 00:00:00 2001 From: "Artem Y. Polyakov" Date: Mon, 12 Nov 2018 18:41:44 -0800 Subject: [PATCH 17/59] fix --- opal/mca/common/ucx/common_ucx.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index 4f02325b993..3425356e600 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -27,7 +27,6 @@ typedef struct { size_t comm_size; } _worker_info_t; -OBJ_CLASS_DECLARATION(_worker_info_t); typedef struct { int ctx_id; @@ -36,15 +35,11 @@ typedef struct { _worker_info_t *winfo; } _tlocal_ctx_t; -OBJ_CLASS_DECLARATION(_tlocal_ctx_t); - typedef struct { _worker_info_t *worker; ucp_rkey_h *rkeys; } _mem_info_t; -OBJ_CLASS_DECLARATION(_mem_info_t); - typedef struct { int mem_id; int is_freed; @@ -52,8 +47,6 @@ typedef struct { _mem_info_t *mem; } _tlocal_mem_t; -OBJ_CLASS_DECLARATION(_tlocal_mem_t); - typedef struct { opal_list_item_t super; _worker_info_t *ptr; @@ -460,7 +453,7 @@ OPAL_DECLSPEC int opal_common_ucx_wpool_init(opal_common_ucx_wpool_t *wpool, goto err_worker_create; } - wkr = OBJ_NEW(_worker_info_t); + wkr = calloc(1, sizeof(_worker_info_t)); OBJ_CONSTRUCT(&wkr->mutex, opal_mutex_t); wkr->worker = wpool->recv_worker; wkr->endpoints = NULL; From 9ad314547cfb0c9c6658490c174182e22a2c4ec5 Mon Sep 17 00:00:00 2001 From: "Artem Y. Polyakov" Date: Mon, 12 Nov 2018 18:50:31 -0800 Subject: [PATCH 18/59] fix #2 --- opal/mca/common/ucx/common_ucx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index 3425356e600..841daac0bda 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -386,7 +386,7 @@ static _worker_info_t* _wpool_remove_from_idle(opal_common_ucx_wpool_t *wpool) OPAL_DECLSPEC opal_common_ucx_wpool_t * opal_common_ucx_wpool_allocate(void) { - opal_common_ucx_wpool_t *ptr = calloc(1, sizeof(opal_common_ucx_wpool_t *)); + opal_common_ucx_wpool_t *ptr = calloc(1, sizeof(opal_common_ucx_wpool_t)); ptr->refcnt = 0; return ptr; } From 0c962a5d5f8d32c81f4059a70262ddb316895016 Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Mon, 12 Nov 2018 19:22:12 -0800 Subject: [PATCH 19/59] fix --- ompi/mca/osc/ucx/osc_ucx_component.c | 2 +- opal/mca/common/ucx/common_ucx.c | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/ompi/mca/osc/ucx/osc_ucx_component.c b/ompi/mca/osc/ucx/osc_ucx_component.c index 0bcbd9d35d4..8f8a2bc42ca 100644 --- a/ompi/mca/osc/ucx/osc_ucx_component.c +++ b/ompi/mca/osc/ucx/osc_ucx_component.c @@ -324,7 +324,7 @@ static int component_select(struct ompi_win_t *win, void **base, size_t size, in } state_base = (void *)&(module->state); - ret = opal_common_ucx_mem_create(module->ctx, comm_size, state_base, + ret = opal_common_ucx_mem_create(module->ctx, comm_size, &state_base, sizeof(ompi_osc_ucx_state_t), OPAL_COMMON_UCX_MEM_MAP, &exchange_len_info, (void *)module->comm, &module->state_mem); diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index 841daac0bda..b2d7fd63274 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -416,6 +416,7 @@ OPAL_DECLSPEC int opal_common_ucx_wpool_init(opal_common_ucx_wpool_t *wpool, wpool->refcnt++; wpool->cur_ctxid = wpool->cur_memid = 0; OBJ_CONSTRUCT(&wpool->mutex, opal_mutex_t); + OBJ_CONSTRUCT(&wpool->tls_list, opal_list_t); status = ucp_config_read("MPI", NULL, &config); if (UCS_OK != status) { @@ -455,6 +456,7 @@ OPAL_DECLSPEC int opal_common_ucx_wpool_init(opal_common_ucx_wpool_t *wpool, wkr = calloc(1, sizeof(_worker_info_t)); OBJ_CONSTRUCT(&wkr->mutex, opal_mutex_t); + wkr->worker = wpool->recv_worker; wkr->endpoints = NULL; wkr->comm_size = 0; @@ -512,7 +514,7 @@ OPAL_DECLSPEC void opal_common_ucx_wpool_finalize(opal_common_ucx_wpool_t *wpool opal_mutex_unlock(&wpool->mutex); OBJ_DESTRUCT(&wpool->idle_workers); - + OBJ_DESTRUCT(&wpool->tls_list); OBJ_DESTRUCT(&wpool->mutex); ucp_worker_release_address(wpool->recv_worker, wpool->recv_waddr); ucp_worker_destroy(wpool->recv_worker); @@ -636,7 +638,7 @@ static int _comm_ucx_mem_map(opal_common_ucx_wpool_t *wpool, if (status != UCS_OK) { MCA_COMMON_UCX_VERBOSE(1, "ucp_mem_map failed: %d", status); ret = OPAL_ERROR; - goto error; + return ret; } mem_attrs.field_mask = UCP_MEM_ATTR_FIELD_ADDRESS | UCP_MEM_ATTR_FIELD_LENGTH; @@ -963,7 +965,7 @@ static int _tlocal_ctx_connect(_tlocal_ctx_t *ctx_rec, int target) displ = gctx->recv_worker_displs[target]; ep_params.address = (ucp_address_t *)&(gctx->recv_worker_addrs[displ]); status = ucp_ep_create(winfo->worker, &ep_params, &winfo->endpoints[target]); - opal_mutex_lock(&winfo->mutex); + opal_mutex_unlock(&winfo->mutex); if (status != UCS_OK) { // TODO: error out here From 556875eb4365f409f525e7190281172aac1bfcdb Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Mon, 12 Nov 2018 20:13:19 -0800 Subject: [PATCH 20/59] fix --- ompi/mca/osc/ucx/osc_ucx_component.c | 7 ++++--- ompi/mca/osc/ucx/osc_ucx_passive_target.c | 9 +++++++++ opal/mca/common/ucx/common_ucx.c | 13 ++++++++++--- 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/ompi/mca/osc/ucx/osc_ucx_component.c b/ompi/mca/osc/ucx/osc_ucx_component.c index 8f8a2bc42ca..391cee773c1 100644 --- a/ompi/mca/osc/ucx/osc_ucx_component.c +++ b/ompi/mca/osc/ucx/osc_ucx_component.c @@ -347,7 +347,7 @@ static int component_select(struct ompi_win_t *win, void **base, size_t size, in memcpy((char*)my_info + sizeof(uint64_t), &state_base, sizeof(uint64_t)); recv_buf = (char *)calloc(comm_size, 2 * sizeof(uint64_t)); - ret = comm->c_coll->coll_allgather((void *)&my_info, 2 * sizeof(uint64_t), + ret = comm->c_coll->coll_allgather((void *)my_info, 2 * sizeof(uint64_t), MPI_BYTE, recv_buf, 2 * sizeof(uint64_t), MPI_BYTE, comm, comm->c_coll->coll_allgather_module); if (ret != OMPI_SUCCESS) { @@ -357,8 +357,8 @@ static int component_select(struct ompi_win_t *win, void **base, size_t size, in module->addrs = calloc(comm_size, sizeof(uint64_t)); module->state_addrs = calloc(comm_size, sizeof(uint64_t)); for (i = 0; i < comm_size; i++) { - memcpy(&(module->addrs[i]), recv_buf, sizeof(uint64_t)); - memcpy(&(module->state_addrs[i]), recv_buf + sizeof(uint64_t), sizeof(uint64_t)); + memcpy(&(module->addrs[i]), recv_buf + i * 2 * sizeof(uint64_t), sizeof(uint64_t)); + memcpy(&(module->state_addrs[i]), recv_buf + i * 2 * sizeof(uint64_t) + sizeof(uint64_t), sizeof(uint64_t)); } free(recv_buf); @@ -601,5 +601,6 @@ int ompi_osc_ucx_free(struct ompi_win_t *win) { return ret; */ + opal_common_ucx_wpool_finalize(mca_osc_ucx_component.wpool); return OMPI_SUCCESS; } diff --git a/ompi/mca/osc/ucx/osc_ucx_passive_target.c b/ompi/mca/osc/ucx/osc_ucx_passive_target.c index 32349f55502..4b05d1e7528 100644 --- a/ompi/mca/osc/ucx/osc_ucx_passive_target.c +++ b/ompi/mca/osc/ucx/osc_ucx_passive_target.c @@ -24,6 +24,15 @@ static inline int start_shared(ompi_osc_ucx_module_t *module, int target) { // ucs_status_t status; // int ret; +/* + volatile int delay = 1; + while( delay ){ + sleep(1); + } +*/ + + return OMPI_SUCCESS; + while (true) { opal_common_ucx_mem_fetch(module->state_mem, UCP_ATOMIC_FETCH_OP_FADD, 1, target, diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index b2d7fd63274..3fe18d91df8 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -814,10 +814,9 @@ _tlocal_tls_get_worker(_tlocal_table_t *tls, _worker_info_t **_winfo) return OPAL_ERR_OUT_OF_RESOURCE; } OBJ_CONSTRUCT(&winfo->mutex, opal_mutex_t); - _create_ctx_worker(tls->wpool); + winfo->worker = _create_ctx_worker(tls->wpool); winfo->endpoints = NULL; winfo->comm_size = 0; - opal_mutex_unlock(&tls->wpool->mutex); } *_winfo = winfo; @@ -844,7 +843,7 @@ static int _tlocal_tls_memtbl_extend(_tlocal_table_t *tbl, size_t append) { size_t i; - size_t newsize = (tbl->ctx_tbl_size + append); + size_t newsize = (tbl->mem_tbl_size + append); tbl->mem_tbl = realloc(tbl->mem_tbl, newsize * sizeof(*tbl->mem_tbl)); for (i = tbl->mem_tbl_size; i < tbl->mem_tbl_size + append; i++) { @@ -1325,6 +1324,14 @@ opal_common_ucx_mem_flush(opal_common_ucx_mem_t *mem, { _worker_list_item_t *item; opal_common_ucx_ctx_t *ctx = mem->ctx; + + /* + volatile int delay = 1; + while( delay ){ + sleep(1); + } + */ + opal_mutex_lock(&ctx->mutex); OPAL_LIST_FOREACH(item, &ctx->workers, _worker_list_item_t) { switch (scope) { From d3ebdc39a69bae52b6b038be06546d15f71f519d Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Fri, 16 Nov 2018 14:11:58 -0800 Subject: [PATCH 21/59] add printfs --- opal/mca/common/ucx/common_ucx.c | 368 ++++++++++++++++++++----------- opal/mca/common/ucx/common_ucx.h | 6 +- 2 files changed, 243 insertions(+), 131 deletions(-) diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index 3fe18d91df8..0f364b05a0d 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -86,30 +86,17 @@ OBJ_CLASS_INSTANCE(_tlocal_table_t, opal_list_item_t, NULL, NULL); static pthread_key_t _tlocal_key = {0}; -OPAL_DECLSPEC int -opal_common_ucx_mem_op(opal_common_ucx_mem_t *mem, - opal_common_ucx_op_t op, - int target, - void *buffer, size_t len, - uint64_t rem_addr); - - static int _tlocal_tls_ctxtbl_extend(_tlocal_table_t *tbl, size_t append); static int _tlocal_tls_memtbl_extend(_tlocal_table_t *tbl, size_t append); static _tlocal_table_t* _common_ucx_tls_init(opal_common_ucx_wpool_t *wpool); -static inline _tlocal_ctx_t *_tlocal_ctx_search(_tlocal_table_t *tls, - int ctx_id); +static inline _tlocal_ctx_t *_tlocal_ctx_search(_tlocal_table_t *tls, int ctx_id); static int _tlocal_cleanup_ctx_record(_tlocal_ctx_t *ctx_rec); -static _tlocal_ctx_t *_tlocal_add_ctx(_tlocal_table_t *tls, - opal_common_ucx_ctx_t *ctx); +static _tlocal_ctx_t *_tlocal_add_ctx(_tlocal_table_t *tls, opal_common_ucx_ctx_t *ctx); static int _tlocal_ctx_connect(_tlocal_ctx_t *ctx, int target); static int _tlocal_ctx_release(opal_common_ucx_ctx_t *ctx); -static inline _tlocal_mem_t *_tlocal_search_mem(_tlocal_table_t *tls, - int mem_id); -static _tlocal_mem_t *_tlocal_add_mem(_tlocal_table_t *tls, - opal_common_ucx_mem_t *mem); -static int _tlocal_mem_create_rkey(_tlocal_mem_t *mem_rec, ucp_ep_h ep, - int target); +static inline _tlocal_mem_t *_tlocal_search_mem(_tlocal_table_t *tls, int mem_id); +static _tlocal_mem_t *_tlocal_add_mem(_tlocal_table_t *tls, opal_common_ucx_mem_t *mem); +static int _tlocal_mem_create_rkey(_tlocal_mem_t *mem_rec, ucp_ep_h ep, int target); /***********************************************************************/ @@ -320,7 +307,8 @@ static inline void _cleanup_tlocal(void *arg) // 2. Return all workers into the idle pool } -static ucp_worker_h _create_ctx_worker(opal_common_ucx_wpool_t *wpool) +static +ucp_worker_h _create_ctx_worker(opal_common_ucx_wpool_t *wpool) { ucp_worker_params_t worker_params; ucp_worker_h worker; @@ -331,14 +319,17 @@ static ucp_worker_h _create_ctx_worker(opal_common_ucx_wpool_t *wpool) worker_params.thread_mode = UCS_THREAD_MODE_SINGLE; status = ucp_worker_create(wpool->ucp_ctx, &worker_params, &worker); if (UCS_OK != status) { + MCA_COMMON_UCX_VERBOSE(1, "ucp_worker_create failed: %d", status); return NULL; } + printf("_create_ctx_worker: worker = %p\n", (void *)worker); + return worker; } -static int _wpool_add_to_idle(opal_common_ucx_wpool_t *wpool, - _worker_info_t *winfo) +static +int _wpool_add_to_idle(opal_common_ucx_wpool_t *wpool, _worker_info_t *winfo) { _idle_list_item_t *item; @@ -361,10 +352,13 @@ static int _wpool_add_to_idle(opal_common_ucx_wpool_t *wpool, opal_mutex_lock(&wpool->mutex); opal_list_append(&wpool->idle_workers, &item->super); opal_mutex_unlock(&wpool->mutex); + + printf("_wpool_add_to_idle: wpool = %p winfo = %p\n", (void *)wpool, (void *)winfo); return OPAL_SUCCESS; } -static _worker_info_t* _wpool_remove_from_idle(opal_common_ucx_wpool_t *wpool) +static +_worker_info_t* _wpool_remove_from_idle(opal_common_ucx_wpool_t *wpool) { _worker_info_t *wkr = NULL; _idle_list_item_t *item = NULL; @@ -381,26 +375,34 @@ static _worker_info_t* _wpool_remove_from_idle(opal_common_ucx_wpool_t *wpool) OBJ_RELEASE(item); } + printf("_wpool_remove_from_idle: wpool = %p\n", (void *)wpool); return wkr; } -OPAL_DECLSPEC opal_common_ucx_wpool_t * opal_common_ucx_wpool_allocate(void) +OPAL_DECLSPEC +opal_common_ucx_wpool_t * opal_common_ucx_wpool_allocate(void) { opal_common_ucx_wpool_t *ptr = calloc(1, sizeof(opal_common_ucx_wpool_t)); ptr->refcnt = 0; + + printf("opal_common_ucx_wpool_allocate: wpool = %p\n", (void *)ptr); return ptr; } -OPAL_DECLSPEC void opal_common_ucx_wpool_free(opal_common_ucx_wpool_t *wpool) +OPAL_DECLSPEC +void opal_common_ucx_wpool_free(opal_common_ucx_wpool_t *wpool) { assert(wpool->refcnt == 0); + + printf("opal_common_ucx_wpool_free: wpool = %p\n", (void *)wpool); free(wpool); } -OPAL_DECLSPEC int opal_common_ucx_wpool_init(opal_common_ucx_wpool_t *wpool, - int proc_world_size, - ucp_request_init_callback_t req_init_ptr, - size_t req_size, bool enable_mt) +OPAL_DECLSPEC +int opal_common_ucx_wpool_init(opal_common_ucx_wpool_t *wpool, + int proc_world_size, + ucp_request_init_callback_t req_init_ptr, + size_t req_size, bool enable_mt) { ucp_config_t *config = NULL; ucp_params_t context_params; @@ -476,6 +478,7 @@ OPAL_DECLSPEC int opal_common_ucx_wpool_init(opal_common_ucx_wpool_t *wpool, pthread_key_create(&_tlocal_key, _cleanup_tlocal); + printf("opal_common_ucx_wpool_init: wpool = %p\n", (void *)wpool); return rc; err_wpool_add: @@ -490,10 +493,12 @@ OPAL_DECLSPEC int opal_common_ucx_wpool_init(opal_common_ucx_wpool_t *wpool, return rc; } -OPAL_DECLSPEC void opal_common_ucx_wpool_finalize(opal_common_ucx_wpool_t *wpool) +OPAL_DECLSPEC +void opal_common_ucx_wpool_finalize(opal_common_ucx_wpool_t *wpool) { wpool->refcnt--; if (wpool->refcnt > 0) { + printf("opal_common_ucx_wpool_finalize: wpool = %p\n", (void *)wpool); return; } @@ -519,12 +524,15 @@ OPAL_DECLSPEC void opal_common_ucx_wpool_finalize(opal_common_ucx_wpool_t *wpool ucp_worker_release_address(wpool->recv_worker, wpool->recv_waddr); ucp_worker_destroy(wpool->recv_worker); ucp_cleanup(wpool->ucp_ctx); + printf("opal_common_ucx_wpool_finalize: wpool = %p\n", (void *)wpool); + return; } -OPAL_DECLSPEC int opal_common_ucx_ctx_create(opal_common_ucx_wpool_t *wpool, int comm_size, - opal_common_ucx_exchange_func_t exchange_func, - void *exchange_metadata, - opal_common_ucx_ctx_t **ctx_ptr) +OPAL_DECLSPEC +int opal_common_ucx_ctx_create(opal_common_ucx_wpool_t *wpool, int comm_size, + opal_common_ucx_exchange_func_t exchange_func, + void *exchange_metadata, + opal_common_ucx_ctx_t **ctx_ptr) { opal_common_ucx_ctx_t *ctx = calloc(1, sizeof(*ctx)); int ret = OPAL_SUCCESS; @@ -546,6 +554,7 @@ OPAL_DECLSPEC int opal_common_ucx_ctx_create(opal_common_ucx_wpool_t *wpool, int } (*ctx_ptr) = ctx; + printf("opal_common_ucx_wpool_create: wpool = %p, (*ctx_ptr) = %p\n", (void *)wpool, (void *)(*ctx_ptr)); return ret; error: @@ -562,6 +571,7 @@ static void _common_ucx_ctx_free(opal_common_ucx_ctx_t *ctx) free(ctx->recv_worker_displs); OBJ_DESTRUCT(&ctx->mutex); OBJ_DESTRUCT(&ctx->workers); + printf("_common_ucx_ctx_free: ctx = %p\n", (void *)ctx); free(ctx); } @@ -569,12 +579,12 @@ OPAL_DECLSPEC void opal_common_ucx_ctx_release(opal_common_ucx_ctx_t *ctx) { // TODO: implement + printf("opal_common_ucx_ctx_release: ctx = %p\n", (void *)ctx); _tlocal_ctx_release(ctx); } static int -_common_ucx_ctx_append(opal_common_ucx_ctx_t *ctx, - _tlocal_ctx_t *ctx_rec) +_common_ucx_ctx_append(opal_common_ucx_ctx_t *ctx, _tlocal_ctx_t *ctx_rec) { _worker_list_item_t *item = OBJ_NEW(_worker_list_item_t); if (NULL == item) { @@ -584,6 +594,7 @@ _common_ucx_ctx_append(opal_common_ucx_ctx_t *ctx, opal_mutex_lock(&ctx->mutex); opal_list_append(&ctx->workers, &item->super); opal_mutex_unlock(&ctx->mutex); + printf("_common_ucx_ctx_append: ctx = %p, ctx_rec = %p\n", (void *)ctx, (void *)ctx_rec); return OPAL_SUCCESS; } @@ -611,6 +622,8 @@ _common_ucx_ctx_remove(opal_common_ucx_ctx_t *ctx, _tlocal_ctx_t *ctx_rec) * we can safely release communication context structure */ _common_ucx_ctx_free(ctx); } + printf("_common_ucx_ctx_remove: ctx = %p, ctx_rec = %p\n", (void *)ctx, (void *)ctx_rec); + return; } static int _comm_ucx_mem_map(opal_common_ucx_wpool_t *wpool, @@ -640,6 +653,7 @@ static int _comm_ucx_mem_map(opal_common_ucx_wpool_t *wpool, ret = OPAL_ERROR; return ret; } + printf("_comm_ucx_mem_map(after ucp_mem_map): memh = %p\n", (void *)(*memh_ptr)); mem_attrs.field_mask = UCP_MEM_ATTR_FIELD_ADDRESS | UCP_MEM_ATTR_FIELD_LENGTH; status = ucp_mem_query((*memh_ptr), &mem_attrs); @@ -648,6 +662,7 @@ static int _comm_ucx_mem_map(opal_common_ucx_wpool_t *wpool, ret = OPAL_ERROR; goto error; } + printf("_comm_ucx_mem_map(after ucp_mem_query): memh = %p\n", (void *)(*memh_ptr)); assert(mem_attrs.length >= size); if (mem_type != OPAL_COMMON_UCX_MEM_ALLOCATE_MAP) { @@ -656,6 +671,8 @@ static int _comm_ucx_mem_map(opal_common_ucx_wpool_t *wpool, (*base) = mem_attrs.address; } + printf("_comm_ucx_mem_map(end): wpool = %p, addr = %p size = %d memh = %p\n", + (void *)wpool, (void *)(*base), (int)size, (void *)(*memh_ptr)); return ret; error: ucp_mem_unmap(wpool->ucp_ctx, (*memh_ptr)); @@ -663,12 +680,13 @@ static int _comm_ucx_mem_map(opal_common_ucx_wpool_t *wpool, } -OPAL_DECLSPEC int opal_common_ucx_mem_create(opal_common_ucx_ctx_t *ctx, int comm_size, - void **mem_base, size_t mem_size, - opal_common_ucx_mem_type_t mem_type, - opal_common_ucx_exchange_func_t exchange_func, - void *exchange_metadata, - opal_common_ucx_mem_t **mem_ptr) +OPAL_DECLSPEC +int opal_common_ucx_mem_create(opal_common_ucx_ctx_t *ctx, int comm_size, + void **mem_base, size_t mem_size, + opal_common_ucx_mem_type_t mem_type, + opal_common_ucx_exchange_func_t exchange_func, + void *exchange_metadata, + opal_common_ucx_mem_t **mem_ptr) { opal_common_ucx_mem_t *mem = calloc(1, sizeof(*mem)); void *rkey_addr = NULL; @@ -688,6 +706,8 @@ OPAL_DECLSPEC int opal_common_ucx_mem_create(opal_common_ucx_ctx_t *ctx, int com MCA_COMMON_UCX_VERBOSE(1, "_comm_ucx_mem_map failed: %d", ret); goto error_mem_map; } + printf("opal_common_ucx_mem_create(after _comm_ucx_mem_map): base = %p, memh = %p\n", + (void *)(*mem_base), (void *)(mem->memh)); status = ucp_rkey_pack(ctx->wpool->ucp_ctx, mem->memh, &rkey_addr, &rkey_addr_len); @@ -696,15 +716,22 @@ OPAL_DECLSPEC int opal_common_ucx_mem_create(opal_common_ucx_ctx_t *ctx, int com ret = OPAL_ERROR; goto error_rkey_pack; } + printf("opal_common_ucx_mem_create(after ucp_rkey_pack): rkey_addr = %p, rkey_addr_len = %d\n", + (void *)rkey_addr, (int)rkey_addr_len); ret = exchange_func(rkey_addr, rkey_addr_len, &mem->mem_addrs, &mem->mem_displs, exchange_metadata); + printf("opal_common_ucx_mem_create(after exchange_func): rkey_addr = %p, rkey_addr_len = %d mem_addrs = %p mem_displs = %p\n", + (void *)rkey_addr, (int)rkey_addr_len, (void *)mem->mem_addrs, (void *)mem->mem_displs); + ucp_rkey_buffer_release(rkey_addr); if (ret != OPAL_SUCCESS) { goto error_rkey_pack; } (*mem_ptr) = mem; + + printf("opal_common_ucx_mem_create(end): mem = %p\n", (void *)mem); return ret; error_rkey_pack: @@ -724,6 +751,7 @@ static void _common_ucx_mem_free(opal_common_ucx_mem_t *mem) ucp_mem_unmap(mem->ctx->wpool->ucp_ctx, mem->memh); OBJ_DESTRUCT(&mem->mutex); OBJ_DESTRUCT(&mem->registrations); + printf("_common_ucx_mem_free: mem = %p\n", (void *)mem); free(mem); } @@ -739,6 +767,7 @@ _common_ucx_mem_append(opal_common_ucx_mem_t *mem, opal_mutex_lock(&mem->mutex); opal_list_append(&mem->registrations, &item->super); opal_mutex_unlock(&mem->mutex); + printf("_common_ucx_mem_append: mem = %p, mem_rec = %p\n", (void *)mem, (void *)mem_rec); return OPAL_SUCCESS; } @@ -766,6 +795,9 @@ _common_ucx_mem_remove(opal_common_ucx_mem_t *mem, _tlocal_mem_t *mem_rec) * we can safely release communication context structure */ _common_ucx_mem_free(mem); } + + printf("_common_ucx_mem_remove(end): mem = %p mem_rec = %p\n", (void *)mem, (void *)mem_rec); + return; } @@ -783,13 +815,17 @@ static _tlocal_table_t* _common_ucx_tls_init(opal_common_ucx_wpool_t *wpool) opal_list_append(&wpool->tls_list, &tls->super); opal_mutex_unlock(&wpool->mutex); - if( _tlocal_tls_ctxtbl_extend(tls, 4) ){ + if(_tlocal_tls_ctxtbl_extend(tls, 4)){ + printf("_tlocal_tls_ctxtbl_extend failed\n"); // TODO: handle error } if(_tlocal_tls_memtbl_extend(tls, 4)) { + printf("_tlocal_tls_memtbl_extend failed\n"); // TODO: handle error } + pthread_setspecific(_tlocal_key, tls); + printf("_common_ucx_tls_init(end): wpool = %p\n", (void *)wpool); return tls; } @@ -799,6 +835,7 @@ _tlocal_get_tls(opal_common_ucx_wpool_t *wpool){ if( OPAL_UNLIKELY(NULL == tls) ) { tls = _common_ucx_tls_init(wpool); } + printf("_tlocal_get_tls(end): wpool = %p tls = %p\n", (void *)wpool, (void *)tls); return tls; } @@ -819,6 +856,7 @@ _tlocal_tls_get_worker(_tlocal_table_t *tls, _worker_info_t **_winfo) winfo->comm_size = 0; } *_winfo = winfo; + printf("_tlocal_tls_get_worker(end): tls = %p winfo = %p\n", (void *)tls, (void *)winfo); return OPAL_SUCCESS; } @@ -837,6 +875,7 @@ _tlocal_tls_ctxtbl_extend(_tlocal_table_t *tbl, size_t append) } tbl->ctx_tbl_size = newsize; + printf("_tlocal_tls_ctxtbl_extend(end): tbl = %p\n", (void *)tbl); return OPAL_SUCCESS; } static int @@ -853,6 +892,7 @@ _tlocal_tls_memtbl_extend(_tlocal_table_t *tbl, size_t append) } } tbl->mem_tbl_size = newsize; + printf("_tlocal_tls_memtbl_extend(end): tbl = %p\n", (void *)tbl); return OPAL_SUCCESS; } @@ -866,6 +906,7 @@ _tlocal_ctx_search(_tlocal_table_t *tls, int ctx_id) return tls->ctx_tbl[i]; } } + printf("_tlocal_tls_memtbl_extend(end): tls = %p ctx_id = %d\n", (void *)tls, ctx_id); return NULL; } @@ -889,6 +930,7 @@ _tlocal_cleanup_ctx_record(_tlocal_ctx_t *ctx_rec) return rc; } memset(ctx_rec, 0, sizeof(*ctx_rec)); + printf("_tlocal_cleanup_ctx_record(end): ctx_rec = %p\n", (void *)ctx_rec); return OPAL_SUCCESS; } @@ -899,7 +941,7 @@ _tlocal_add_ctx(_tlocal_table_t *tls, opal_common_ucx_ctx_t *ctx) size_t i; int rc; - /* Try to find tavailable spot in the table */ + /* Try to find available spot in the table */ for (i=0; ictx_tbl_size; i++) { if (0 == tls->ctx_tbl[i]->ctx_id) { /* Found clean record */ @@ -927,6 +969,8 @@ _tlocal_add_ctx(_tlocal_table_t *tls, opal_common_ucx_ctx_t *ctx) //TODO: error out return NULL; } + printf("_tlocal_add_ctx(after _tlocal_tls_get_worker): tls = %p winfo = %p\n", + (void *)tls, (void *)tls->ctx_tbl[i]->winfo); tls->ctx_tbl[i]->winfo->endpoints = calloc(ctx->comm_size, sizeof(ucp_ep_h)); tls->ctx_tbl[i]->winfo->comm_size = ctx->comm_size; @@ -944,6 +988,8 @@ _tlocal_add_ctx(_tlocal_table_t *tls, opal_common_ucx_ctx_t *ctx) //TODO: error out return NULL; } + printf("_tlocal_add_ctx(after _common_ucx_ctx_append): ctx = %p tls->ctx_tbl = %p\n", + (void *)ctx, (void *)tls->ctx_tbl); /* All good - return the record */ return tls->ctx_tbl[i]; @@ -964,13 +1010,14 @@ static int _tlocal_ctx_connect(_tlocal_ctx_t *ctx_rec, int target) displ = gctx->recv_worker_displs[target]; ep_params.address = (ucp_address_t *)&(gctx->recv_worker_addrs[displ]); status = ucp_ep_create(winfo->worker, &ep_params, &winfo->endpoints[target]); - opal_mutex_unlock(&winfo->mutex); - if (status != UCS_OK) { -// TODO: error out here -// OSC_UCX_VERBOSE(1, "ucp_ep_create failed: %d", status); + opal_mutex_unlock(&winfo->mutex); + MCA_COMMON_UCX_VERBOSE(1, "ucp_ep_create failed: %d", status); return OPAL_ERROR; } + printf("_tlocal_ctx_connect(after ucp_ep_create): worker = %p ep = %p\n", + (void *)winfo->worker, (void *)winfo->endpoints[target]); + opal_mutex_unlock(&winfo->mutex); return OPAL_SUCCESS; } @@ -987,7 +1034,11 @@ static int _tlocal_ctx_release(opal_common_ucx_ctx_t *ctx) /* May free the ctx structure. Do not use it */ _common_ucx_ctx_remove(ctx, ctx_rec); + printf("_tlocal_ctx_release(after _common_ucx_ctx_remove): ctx = %p ctx_rec = %p\n", + (void *)ctx, (void *)ctx_rec); rc = _wpool_add_to_idle(tls->wpool, ctx_rec->winfo); + printf("_tlocal_ctx_release(after _wpool_add_to_idle): wpool = %p winfo = %p\n", + (void *)tls->wpool, (void *)ctx_rec->winfo); ctx_rec->ctx_id = 0; ctx_rec->is_freed = 0; @@ -1001,6 +1052,8 @@ static inline _tlocal_mem_t * _tlocal_search_mem(_tlocal_table_t *tls, int mem_id) { size_t i; + printf("_tlocal_search_mem(begin): tls = %p mem_id = %d\n", + (void *)tls, (int)mem_id); for(i=0; imem_tbl_size; i++) { if( tls->mem_tbl[i]->mem_id == mem_id){ return tls->mem_tbl[i]; @@ -1021,10 +1074,14 @@ _tlocal_mem_record_cleanup(_tlocal_mem_t *mem_rec) * This may result in context release as we are using * delayed cleanup */ _common_ucx_mem_remove(mem_rec->gmem, mem_rec); + printf("_tlocal_mem_record_cleanup(_common_ucx_mem_remove): gmem = %p mem_rec = %p\n", + (void *)mem_rec->gmem, (void *)mem_rec); for(i = 0; i < mem_rec->gmem->ctx->comm_size; i++) { if (mem_rec->mem->rkeys[i]) { ucp_rkey_destroy(mem_rec->mem->rkeys[i]); + printf("_tlocal_mem_record_cleanup(after ucp_rkey_destroy): rkey_entry = %p\n", + (void *)mem_rec->mem->rkeys[i]); } } @@ -1041,9 +1098,9 @@ static _tlocal_mem_t *_tlocal_add_mem(_tlocal_table_t *tls, { size_t i; _tlocal_ctx_t *ctx_rec = NULL; - int rc; + int rc = OPAL_SUCCESS; - /* Try to find tavailable spot in the table */ + /* Try to find available spot in the table */ for (i=0; imem_tbl_size; i++) { if (0 == tls->mem_tbl[i]->mem_id) { /* Found a clear record */ @@ -1051,6 +1108,8 @@ static _tlocal_mem_t *_tlocal_add_mem(_tlocal_table_t *tls, if (tls->mem_tbl[i]->is_freed) { /* Found a dirty record. Need to clean it first */ _tlocal_mem_record_cleanup(tls->mem_tbl[i]); + printf("_tlocal_add_mem(after _tlocal_mem_record_cleanup): tls = %p mem_tbl_entry = %p\n", + (void *)tls, (void *)tls->mem_tbl[i]); break; } } @@ -1058,10 +1117,12 @@ static _tlocal_mem_t *_tlocal_add_mem(_tlocal_table_t *tls, if( tls->mem_tbl_size >= i ){ i = tls->mem_tbl_size; rc = _tlocal_tls_memtbl_extend(tls, 4); - if (rc) { + if (rc != OPAL_SUCCESS) { //TODO: error out return NULL; } + printf("_tlocal_add_mem(after _tlocal_tls_memtbl_extend): tls = %p\n", + (void *)tls); } tls->mem_tbl[i]->mem_id = mem->mem_id; tls->mem_tbl[i]->gmem = mem; @@ -1072,6 +1133,9 @@ static _tlocal_mem_t *_tlocal_add_mem(_tlocal_table_t *tls, // TODO: act accordingly - cleanup return NULL; } + printf("_tlocal_add_mem(after _tlocal_ctx_search): tls = %p, ctx_id = %d\n", + (void *)tls, (int)mem->ctx->ctx_id); + tls->mem_tbl[i]->mem->worker = ctx_rec->winfo; tls->mem_tbl[i]->mem->rkeys = calloc(mem->ctx->comm_size, sizeof(*tls->mem_tbl[i]->mem->rkeys)); @@ -1089,6 +1153,8 @@ static _tlocal_mem_t *_tlocal_add_mem(_tlocal_table_t *tls, // TODO: error handling return NULL; } + printf("_tlocal_add_mem(after _common_ucx_mem_append): mem = %p, mem_tbl_entry = %p\n", + (void *)mem, (void *)tls->mem_tbl[i]); return tls->mem_tbl[i]; } @@ -1103,10 +1169,11 @@ static int _tlocal_mem_create_rkey(_tlocal_mem_t *mem_rec, ucp_ep_h ep, int targ status = ucp_ep_rkey_unpack(ep, &gmem->mem_addrs[displ], &minfo->rkeys[target]); if (status != UCS_OK) { - // TODO: error out here - // OSC_UCX_VERBOSE(1, "ucp_ep_create failed: %d", status); + MCA_COMMON_UCX_VERBOSE(1, "ucp_ep_rkey_unpack failed: %d", status); return OPAL_ERROR; } + printf("_tlocal_mem_create_rkey(after ucp_ep_rkey_unpack): mem_rec = %p ep = %p target = %d\n", + (void *)mem_rec, (void *)ep, target); return OPAL_SUCCESS; } @@ -1115,41 +1182,45 @@ static inline int _tlocal_fetch(opal_common_ucx_mem_t *mem, int target, _worker_info_t **_winfo) { _tlocal_table_t *tls = NULL; - _tlocal_ctx_t *ctx_rec; - _worker_info_t *winfo; - _tlocal_mem_t *mem_rec; - _mem_info_t *mem_info; + _tlocal_ctx_t *ctx_rec = NULL; + _worker_info_t *winfo = NULL; + _tlocal_mem_t *mem_rec = NULL; + _mem_info_t *mem_info = NULL; ucp_ep_h ep; ucp_rkey_h rkey; - int rc; + int rc = OPAL_SUCCESS; tls = _tlocal_get_tls(mem->ctx->wpool); + /* Obtain the worker structure */ ctx_rec = _tlocal_ctx_search(tls, mem->ctx->ctx_id); + printf("_tlocal_fetch(after _tlocal_ctx_search): tls = %p ctx_id = %d\n", (void *)tls, (int)mem->ctx->ctx_id); if (OPAL_UNLIKELY(NULL == ctx_rec)) { ctx_rec = _tlocal_add_ctx(tls, mem->ctx); if (NULL == ctx_rec) { - // TODO: err handling return OPAL_ERR_OUT_OF_RESOURCE; } + printf("_tlocal_fetch(after _tlocal_add_ctx): tls = %p ctx = %p\n", (void *)tls, (void *)mem->ctx); } winfo = ctx_rec->winfo; /* Obtain the endpoint */ if (OPAL_UNLIKELY(NULL == winfo->endpoints[target])) { rc = _tlocal_ctx_connect(ctx_rec, target); - if (rc) { + if (rc != OPAL_SUCCESS) { return rc; } + printf("_tlocal_fetch(after _tlocal_ctx_connect): ctx_rec = %p target = %d\n", (void *)ctx_rec, target); } ep = winfo->endpoints[target]; /* Obtain the memory region info */ mem_rec = _tlocal_search_mem(tls, mem->mem_id); + printf("_tlocal_fetch(after _tlocal_search_mem): tls = %p mem_id = %d\n", (void *)tls, (int)mem->mem_id); if (OPAL_UNLIKELY(mem_rec == NULL)) { mem_rec = _tlocal_add_mem(tls, mem); + printf("_tlocal_fetch(after _tlocal_add_mem): tls = %p mem = %p\n", (void *)tls, (void *)mem); if (NULL == mem_rec) { - // TODO: err handling return OPAL_ERR_OUT_OF_RESOURCE; } } @@ -1157,57 +1228,69 @@ static inline int _tlocal_fetch(opal_common_ucx_mem_t *mem, int target, /* Obtain the rkey */ if (OPAL_UNLIKELY(NULL == mem_info->rkeys[target])) { - // Create the rkey + /* Create the rkey */ rc = _tlocal_mem_create_rkey(mem_rec, ep, target); if (rc) { return rc; } + printf("_tlocal_fetch(after _tlocal_mem_create_rkey): mem_rec = %p ep = %p, target = %d\n", + (void *)mem_rec, (void *)ep, target); } *_ep = ep; *_rkey = rkey = mem_info->rkeys[target]; *_winfo = winfo; + + printf("_tlocal_fetch(end): ep = %p, rkey = %p, winfo = %p\n", + (void *)ep, (void *)rkey, (void *)winfo); + return OPAL_SUCCESS; } -OPAL_DECLSPEC int opal_common_ucx_mem_putget(opal_common_ucx_mem_t *mem, - opal_common_ucx_op_t op, - int target, - void *buffer, size_t len, - uint64_t rem_addr) +OPAL_DECLSPEC int +opal_common_ucx_mem_putget(opal_common_ucx_mem_t *mem, + opal_common_ucx_op_t op, + int target, void *buffer, size_t len, + uint64_t rem_addr) { ucp_ep_h ep; ucp_rkey_h rkey; ucs_status_t status; _worker_info_t *winfo; - int rc; + int rc = OPAL_SUCCESS; rc =_tlocal_fetch(mem, target, &ep, &rkey, &winfo); - if( rc ){ - // TODO: err handling + if(OPAL_SUCCESS != rc){ + MCA_COMMON_UCX_VERBOSE(1, "tlocal_fetch failed: %d", rc); return rc; } + printf("opal_common_ucx_mem_putget(after _tlocal_fetch): mem = %p, ep = %p, rkey = %p, winfo = %p\n", + (void *)mem, (void *)ep, (void *)rkey, (void *)winfo); - /* Perform the operation */ + /* Perform the operation */ opal_mutex_lock(&winfo->mutex); switch(op){ - case OPAL_COMMON_UCX_GET: + case OPAL_COMMON_UCX_PUT: status = ucp_put_nbi(ep, buffer,len, rem_addr, rkey); if (status != UCS_OK && status != UCS_INPROGRESS) { - // TODO: Fix the output - // OSC_UCX_VERBOSE(1, "ucp_put_nbi failed: %d", status); + MCA_COMMON_UCX_VERBOSE(1, "ucp_put_nbi failed: %d", status); + opal_mutex_unlock(&winfo->mutex); return OPAL_ERROR; } + printf("opal_common_ucx_mem_putget(after ucp_put_nbi): ep = %p, rkey = %p\n", + (void *)ep, (void *)rkey); break; - case OPAL_COMMON_UCX_PUT: + case OPAL_COMMON_UCX_GET: status = ucp_get_nbi(ep, buffer,len, rem_addr, rkey); if (status != UCS_OK && status != UCS_INPROGRESS) { - // TODO: Fix the output - // OSC_UCX_VERBOSE(1, "ucp_put_nbi failed: %d", status); + MCA_COMMON_UCX_VERBOSE(1, "ucp_get_nbi failed: %d", status); + opal_mutex_unlock(&winfo->mutex); return OPAL_ERROR; } + printf("opal_common_ucx_mem_putget(after ucp_get_nbi): ep = %p, rkey = %p\n", + (void *)ep, (void *)rkey); break; } opal_mutex_unlock(&winfo->mutex); @@ -1215,23 +1298,25 @@ OPAL_DECLSPEC int opal_common_ucx_mem_putget(opal_common_ucx_mem_t *mem, } -OPAL_DECLSPEC int opal_common_ucx_mem_cmpswp(opal_common_ucx_mem_t *mem, - uint64_t compare, uint64_t value, - int target, - void *buffer, size_t len, - uint64_t rem_addr) +OPAL_DECLSPEC +int opal_common_ucx_mem_cmpswp(opal_common_ucx_mem_t *mem, + uint64_t compare, uint64_t value, + int target, void *buffer, size_t len, + uint64_t rem_addr) { ucp_ep_h ep; ucp_rkey_h rkey; - _worker_info_t *winfo; + _worker_info_t *winfo = NULL; ucs_status_t status; - int rc; + int rc = OPAL_SUCCESS; rc =_tlocal_fetch(mem, target, &ep, &rkey, &winfo); - if( rc ){ - // TODO: err handling + if(OPAL_SUCCESS != rc){ + MCA_COMMON_UCX_VERBOSE(1, "tlocal_fetch failed: %d", rc); return rc; } + printf("opal_common_ucx_mem_cmpswp(after _tlocal_fetch): mem = %p, ep = %p, rkey = %p, winfo = %p\n", + (void *)mem, (void *)ep, (void *)rkey, (void *)winfo); /* Perform the operation */ opal_mutex_lock(&winfo->mutex); @@ -1240,30 +1325,36 @@ OPAL_DECLSPEC int opal_common_ucx_mem_cmpswp(opal_common_ucx_mem_t *mem, rem_addr, rkey, winfo->worker); if (status != UCS_OK) { - // TODO: OSC_UCX_VERBOSE(1, "ucp_atomic_cswap64 failed: %d", status); + MCA_COMMON_UCX_VERBOSE(1, "opal_common_ucx_atomic_cswap failed: %d", status); + opal_mutex_unlock(&winfo->mutex); return OPAL_ERROR; } + printf("opal_common_ucx_mem_cmpswp(after opal_common_ucx_atomic_cswap): ep = %p, rkey = %p\n", + (void *)ep, (void *)rkey); + opal_mutex_unlock(&winfo->mutex); return OPAL_SUCCESS; } -OPAL_DECLSPEC int opal_common_ucx_mem_fetch(opal_common_ucx_mem_t *mem, - ucp_atomic_fetch_op_t opcode, uint64_t value, - int target, - void *buffer, size_t len, - uint64_t rem_addr) +OPAL_DECLSPEC +int opal_common_ucx_mem_fetch(opal_common_ucx_mem_t *mem, + ucp_atomic_fetch_op_t opcode, uint64_t value, + int target, void *buffer, size_t len, + uint64_t rem_addr) { - ucp_ep_h ep; - ucp_rkey_h rkey; - _worker_info_t *winfo; + ucp_ep_h ep = NULL; + ucp_rkey_h rkey = NULL; + _worker_info_t *winfo = NULL; ucs_status_t status; - int rc; + int rc = OPAL_SUCCESS; rc =_tlocal_fetch(mem, target, &ep, &rkey, &winfo); - if( rc ){ - // TODO: err handling + if(OPAL_SUCCESS != rc){ + MCA_COMMON_UCX_VERBOSE(1, "tlocal_fetch failed: %d", rc); return rc; } + printf("opal_common_ucx_mem_fetch(after _tlocal_fetch): mem = %p, ep = %p, rkey = %p, winfo = %p\n", + (void *)mem, (void *)ep, (void *)rkey, (void *)winfo); /* Perform the operation */ opal_mutex_lock(&winfo->mutex); @@ -1272,87 +1363,108 @@ OPAL_DECLSPEC int opal_common_ucx_mem_fetch(opal_common_ucx_mem_t *mem, rem_addr, rkey, winfo->worker); if (status != UCS_OK) { - // TODO: OSC_UCX_VERBOSE(1, "ucp_atomic_cswap64 failed: %d", status); + opal_mutex_unlock(&winfo->mutex); + MCA_COMMON_UCX_VERBOSE(1, "ucp_atomic_cswap64 failed: %d", status); return OPAL_ERROR; } + printf("opal_common_ucx_mem_fetch(after opal_common_ucx_atomic_fetch): ep = %p, rkey = %p\n", + (void *)ep, (void *)rkey); + opal_mutex_unlock(&winfo->mutex); + return OPAL_SUCCESS; } -//ucs_status_t ucp_atomic_post(ucp_ep_h ep, ucp_atomic_post_op_t opcode, uint64_t value, -// size_t op_size, uint64_t remote_addr, ucp_rkey_h rkey); - -OPAL_DECLSPEC int opal_common_ucx_mem_post(opal_common_ucx_mem_t *mem, - ucp_atomic_post_op_t opcode, - uint64_t value, - int target, - size_t len, - uint64_t rem_addr) +OPAL_DECLSPEC +int opal_common_ucx_mem_post(opal_common_ucx_mem_t *mem, + ucp_atomic_post_op_t opcode, + uint64_t value, int target, size_t len, + uint64_t rem_addr) { ucp_ep_h ep; ucp_rkey_h rkey; - _worker_info_t *winfo; + _worker_info_t *winfo = NULL; ucs_status_t status; - int rc; + int rc = OPAL_SUCCESS; rc =_tlocal_fetch(mem, target, &ep, &rkey, &winfo); - if( rc ){ - // TODO: err handling + if(OPAL_SUCCESS != rc){ + MCA_COMMON_UCX_VERBOSE(1, "tlocal_fetch failed: %d", rc); return rc; } + printf("opal_common_ucx_mem_post(after _tlocal_fetch): mem = %p, ep = %p, rkey = %p, winfo = %p\n", + (void *)mem, (void *)ep, (void *)rkey, (void *)winfo); /* Perform the operation */ opal_mutex_lock(&winfo->mutex); status = ucp_atomic_post(ep, opcode, value, - len, - rem_addr, rkey); + len, rem_addr, rkey); if (status != UCS_OK) { - // TODO: OSC_UCX_VERBOSE(1, "ucp_atomic_cswap64 failed: %d", status); + opal_mutex_unlock(&winfo->mutex); + MCA_COMMON_UCX_VERBOSE(1, "ucp_atomic_cswap64 failed: %d", status); return OPAL_ERROR; } + printf("opal_common_ucx_mem_post(after ucp_atomic_post): ep = %p, rkey = %p\n", (void *)ep, (void *)rkey); opal_mutex_unlock(&winfo->mutex); + return OPAL_SUCCESS; } - -// TODO: return sttaus -OPAL_DECLSPEC void +OPAL_DECLSPEC int opal_common_ucx_mem_flush(opal_common_ucx_mem_t *mem, opal_common_ucx_flush_scope_t scope, int target) { _worker_list_item_t *item; opal_common_ucx_ctx_t *ctx = mem->ctx; + int rc = OPAL_SUCCESS; - /* - volatile int delay = 1; - while( delay ){ - sleep(1); - } - */ + printf("opal_common_ucx_mem_flush: mem = %p, target = %d\n", (void *)mem, target); opal_mutex_lock(&ctx->mutex); OPAL_LIST_FOREACH(item, &ctx->workers, _worker_list_item_t) { switch (scope) { case OPAL_COMMON_UCX_SCOPE_WORKER: opal_mutex_lock(&item->ptr->winfo->mutex); - opal_common_ucx_worker_flush(item->ptr->winfo->worker); + rc = opal_common_ucx_worker_flush(item->ptr->winfo->worker); + if (rc != OPAL_SUCCESS) { + opal_mutex_unlock(&item->ptr->winfo->mutex); + opal_mutex_unlock(&ctx->mutex); + MCA_COMMON_UCX_VERBOSE(1, "opal_common_ucx_worker_flush failed: %d", rc); + return OPAL_ERROR; + } + printf("opal_common_ucx_mem_flush(after opal_common_ucx_worker_flush): worker = %p\n", + (void *)item->ptr->winfo->worker); opal_mutex_unlock(&item->ptr->winfo->mutex); break; case OPAL_COMMON_UCX_SCOPE_EP: if (NULL != item->ptr->winfo->endpoints[target] ) { opal_mutex_lock(&item->ptr->winfo->mutex); - opal_common_ucx_ep_flush(item->ptr->winfo->endpoints[target], - item->ptr->winfo->worker); + rc = opal_common_ucx_ep_flush(item->ptr->winfo->endpoints[target], + item->ptr->winfo->worker); + if (rc != OPAL_SUCCESS) { + opal_mutex_unlock(&item->ptr->winfo->mutex); + opal_mutex_unlock(&ctx->mutex); + MCA_COMMON_UCX_VERBOSE(1, "opal_common_ucx_ep_flush failed: %d", rc); + return OPAL_ERROR; + } + printf("opal_common_ucx_mem_flush(after opal_common_ucx_worker_flush): ep = %p worker = %p\n", + (void *)item->ptr->winfo->endpoints[target], + (void *)item->ptr->winfo->worker); opal_mutex_unlock(&item->ptr->winfo->mutex); } } } opal_mutex_unlock(&ctx->mutex); + + return rc; } -OPAL_DECLSPEC int opal_common_ucx_workers_progress(opal_common_ucx_wpool_t *wpool) { +OPAL_DECLSPEC +int opal_common_ucx_workers_progress(opal_common_ucx_wpool_t *wpool) { // TODO - return 0; + printf("opal_common_ucx_workres_progress: wpool = %p\n", (void *)wpool); + return OPAL_SUCCESS; } + diff --git a/opal/mca/common/ucx/common_ucx.h b/opal/mca/common/ucx/common_ucx.h index fac588390d7..b586292fc8e 100644 --- a/opal/mca/common/ucx/common_ucx.h +++ b/opal/mca/common/ucx/common_ucx.h @@ -165,9 +165,9 @@ OPAL_DECLSPEC int opal_common_ucx_mem_create(opal_common_ucx_ctx_t *ctx, int com opal_common_ucx_exchange_func_t exchange_func, void *exchange_metadata, opal_common_ucx_mem_t **mem_ptr); -OPAL_DECLSPEC void opal_common_ucx_mem_flush(opal_common_ucx_mem_t *mem, - opal_common_ucx_flush_scope_t scope, - int target); +OPAL_DECLSPEC int opal_common_ucx_mem_flush(opal_common_ucx_mem_t *mem, + opal_common_ucx_flush_scope_t scope, + int target); OPAL_DECLSPEC int opal_common_ucx_workers_progress(opal_common_ucx_wpool_t *wpool); OPAL_DECLSPEC int opal_common_ucx_mem_cmpswp(opal_common_ucx_mem_t *mem, uint64_t compare, uint64_t value, From 091972c18791480a323496c70b037dfe6368b05b Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Fri, 16 Nov 2018 14:21:44 -0800 Subject: [PATCH 22/59] fix --- ompi/mca/osc/ucx/osc_ucx_comm.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/ompi/mca/osc/ucx/osc_ucx_comm.c b/ompi/mca/osc/ucx/osc_ucx_comm.c index 7f93ef42218..2e57c34a136 100644 --- a/ompi/mca/osc/ucx/osc_ucx_comm.c +++ b/ompi/mca/osc/ucx/osc_ucx_comm.c @@ -67,18 +67,14 @@ static inline int check_sync_state(ompi_osc_ucx_module_t *module, int target, } static inline int incr_and_check_ops_num(ompi_osc_ucx_module_t *module, int target) { - //int status; - + int status; module->global_ops_num++; module->per_target_ops_nums[target]++; if (module->global_ops_num >= OSC_UCX_OPS_THRESHOLD) { - // TODO: - /*status =*/ opal_common_ucx_mem_flush(module->mem, OPAL_COMMON_UCX_SCOPE_EP, target); -/* + status = opal_common_ucx_mem_flush(module->mem, OPAL_COMMON_UCX_SCOPE_EP, target); if (status != OMPI_SUCCESS) { return status; } -*/ module->global_ops_num -= module->per_target_ops_nums[target]; module->per_target_ops_nums[target] = 0; } From 24623c98cd4d1bfa67c33f84ae8dbf042c052c46 Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Fri, 16 Nov 2018 15:30:58 -0800 Subject: [PATCH 23/59] add printf msg --- opal/mca/common/ucx/common_ucx.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index 0f364b05a0d..a05959d8aad 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -1464,7 +1464,11 @@ opal_common_ucx_mem_flush(opal_common_ucx_mem_t *mem, OPAL_DECLSPEC int opal_common_ucx_workers_progress(opal_common_ucx_wpool_t *wpool) { // TODO - printf("opal_common_ucx_workres_progress: wpool = %p\n", (void *)wpool); + static int enter = 0; + if (enter == 0) { + printf("opal_common_ucx_workres_progress: wpool = %p\n", (void *)wpool); + } + enter++; return OPAL_SUCCESS; } From 836ec01913aa87521da4b6538893eab46fffcd8f Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Fri, 16 Nov 2018 15:33:15 -0800 Subject: [PATCH 24/59] TEST --- opal/mca/common/ucx/common_ucx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index a05959d8aad..a9f3006c5cb 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -1468,6 +1468,7 @@ int opal_common_ucx_workers_progress(opal_common_ucx_wpool_t *wpool) { if (enter == 0) { printf("opal_common_ucx_workres_progress: wpool = %p\n", (void *)wpool); } + enter++; return OPAL_SUCCESS; } From fbadd86167fcb332dddd35eaa55144ba13d6adff Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Fri, 16 Nov 2018 16:00:40 -0800 Subject: [PATCH 25/59] fix dbg out --- opal/mca/common/ucx/common_ucx.c | 168 +++++++++++++++++++------------ 1 file changed, 103 insertions(+), 65 deletions(-) diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index a9f3006c5cb..e974e47fd4c 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -86,6 +86,39 @@ OBJ_CLASS_INSTANCE(_tlocal_table_t, opal_list_item_t, NULL, NULL); static pthread_key_t _tlocal_key = {0}; + +#define FDBG +#ifdef FDBG +__thread FILE *tls_pf = NULL; +__thread int initialized = 0; + +#include + +void init_tls_dbg(void) +{ + if( !initialized ) { + int tid = syscall(__NR_gettid); + char hname[128]; + gethostname(hname, 127); + char fname[128]; + + sprintf(fname, "%s.%d.log", hname, tid); + tls_pf = fopen(fname, "w"); + initialized = 1; + } +} + +#define DBG_OUT(...) \ +{ \ + init_tls_dbg(); \ + fprintf(tls_pf, __VA_ARGS__); \ +} + +#else +#define DBG_OUT(...) +#endif + + static int _tlocal_tls_ctxtbl_extend(_tlocal_table_t *tbl, size_t append); static int _tlocal_tls_memtbl_extend(_tlocal_table_t *tbl, size_t append); static _tlocal_table_t* _common_ucx_tls_init(opal_common_ucx_wpool_t *wpool); @@ -323,7 +356,7 @@ ucp_worker_h _create_ctx_worker(opal_common_ucx_wpool_t *wpool) return NULL; } - printf("_create_ctx_worker: worker = %p\n", (void *)worker); + DBG_OUT("_create_ctx_worker: worker = %p\n", (void *)worker); return worker; } @@ -353,7 +386,7 @@ int _wpool_add_to_idle(opal_common_ucx_wpool_t *wpool, _worker_info_t *winfo) opal_list_append(&wpool->idle_workers, &item->super); opal_mutex_unlock(&wpool->mutex); - printf("_wpool_add_to_idle: wpool = %p winfo = %p\n", (void *)wpool, (void *)winfo); + DBG_OUT("_wpool_add_to_idle: wpool = %p winfo = %p\n", (void *)wpool, (void *)winfo); return OPAL_SUCCESS; } @@ -375,17 +408,21 @@ _worker_info_t* _wpool_remove_from_idle(opal_common_ucx_wpool_t *wpool) OBJ_RELEASE(item); } - printf("_wpool_remove_from_idle: wpool = %p\n", (void *)wpool); + DBG_OUT("_wpool_remove_from_idle: wpool = %p\n", (void *)wpool); return wkr; } + + + + OPAL_DECLSPEC opal_common_ucx_wpool_t * opal_common_ucx_wpool_allocate(void) { opal_common_ucx_wpool_t *ptr = calloc(1, sizeof(opal_common_ucx_wpool_t)); ptr->refcnt = 0; - printf("opal_common_ucx_wpool_allocate: wpool = %p\n", (void *)ptr); + DBG_OUT("opal_common_ucx_wpool_allocate: wpool = %p\n", (void *)ptr); return ptr; } @@ -394,7 +431,8 @@ void opal_common_ucx_wpool_free(opal_common_ucx_wpool_t *wpool) { assert(wpool->refcnt == 0); - printf("opal_common_ucx_wpool_free: wpool = %p\n", (void *)wpool); + DBG_OUT("opal_common_ucx_wpool_free: wpool = %p\n", (void *)wpool); + free(wpool); } @@ -478,7 +516,7 @@ int opal_common_ucx_wpool_init(opal_common_ucx_wpool_t *wpool, pthread_key_create(&_tlocal_key, _cleanup_tlocal); - printf("opal_common_ucx_wpool_init: wpool = %p\n", (void *)wpool); + DBG_OUT("opal_common_ucx_wpool_init: wpool = %p\n", (void *)wpool); return rc; err_wpool_add: @@ -498,7 +536,7 @@ void opal_common_ucx_wpool_finalize(opal_common_ucx_wpool_t *wpool) { wpool->refcnt--; if (wpool->refcnt > 0) { - printf("opal_common_ucx_wpool_finalize: wpool = %p\n", (void *)wpool); + DBG_OUT("opal_common_ucx_wpool_finalize: wpool = %p\n", (void *)wpool); return; } @@ -524,7 +562,7 @@ void opal_common_ucx_wpool_finalize(opal_common_ucx_wpool_t *wpool) ucp_worker_release_address(wpool->recv_worker, wpool->recv_waddr); ucp_worker_destroy(wpool->recv_worker); ucp_cleanup(wpool->ucp_ctx); - printf("opal_common_ucx_wpool_finalize: wpool = %p\n", (void *)wpool); + DBG_OUT("opal_common_ucx_wpool_finalize: wpool = %p\n", (void *)wpool); return; } @@ -554,7 +592,7 @@ int opal_common_ucx_ctx_create(opal_common_ucx_wpool_t *wpool, int comm_size, } (*ctx_ptr) = ctx; - printf("opal_common_ucx_wpool_create: wpool = %p, (*ctx_ptr) = %p\n", (void *)wpool, (void *)(*ctx_ptr)); + DBG_OUT("opal_common_ucx_wpool_create: wpool = %p, (*ctx_ptr) = %p\n", (void *)wpool, (void *)(*ctx_ptr)); return ret; error: @@ -571,7 +609,7 @@ static void _common_ucx_ctx_free(opal_common_ucx_ctx_t *ctx) free(ctx->recv_worker_displs); OBJ_DESTRUCT(&ctx->mutex); OBJ_DESTRUCT(&ctx->workers); - printf("_common_ucx_ctx_free: ctx = %p\n", (void *)ctx); + DBG_OUT("_common_ucx_ctx_free: ctx = %p\n", (void *)ctx); free(ctx); } @@ -579,7 +617,7 @@ OPAL_DECLSPEC void opal_common_ucx_ctx_release(opal_common_ucx_ctx_t *ctx) { // TODO: implement - printf("opal_common_ucx_ctx_release: ctx = %p\n", (void *)ctx); + DBG_OUT("opal_common_ucx_ctx_release: ctx = %p\n", (void *)ctx); _tlocal_ctx_release(ctx); } @@ -594,7 +632,7 @@ _common_ucx_ctx_append(opal_common_ucx_ctx_t *ctx, _tlocal_ctx_t *ctx_rec) opal_mutex_lock(&ctx->mutex); opal_list_append(&ctx->workers, &item->super); opal_mutex_unlock(&ctx->mutex); - printf("_common_ucx_ctx_append: ctx = %p, ctx_rec = %p\n", (void *)ctx, (void *)ctx_rec); + DBG_OUT("_common_ucx_ctx_append: ctx = %p, ctx_rec = %p\n", (void *)ctx, (void *)ctx_rec); return OPAL_SUCCESS; } @@ -622,7 +660,7 @@ _common_ucx_ctx_remove(opal_common_ucx_ctx_t *ctx, _tlocal_ctx_t *ctx_rec) * we can safely release communication context structure */ _common_ucx_ctx_free(ctx); } - printf("_common_ucx_ctx_remove: ctx = %p, ctx_rec = %p\n", (void *)ctx, (void *)ctx_rec); + DBG_OUT("_common_ucx_ctx_remove: ctx = %p, ctx_rec = %p\n", (void *)ctx, (void *)ctx_rec); return; } @@ -653,7 +691,7 @@ static int _comm_ucx_mem_map(opal_common_ucx_wpool_t *wpool, ret = OPAL_ERROR; return ret; } - printf("_comm_ucx_mem_map(after ucp_mem_map): memh = %p\n", (void *)(*memh_ptr)); + DBG_OUT("_comm_ucx_mem_map(after ucp_mem_map): memh = %p\n", (void *)(*memh_ptr)); mem_attrs.field_mask = UCP_MEM_ATTR_FIELD_ADDRESS | UCP_MEM_ATTR_FIELD_LENGTH; status = ucp_mem_query((*memh_ptr), &mem_attrs); @@ -662,7 +700,7 @@ static int _comm_ucx_mem_map(opal_common_ucx_wpool_t *wpool, ret = OPAL_ERROR; goto error; } - printf("_comm_ucx_mem_map(after ucp_mem_query): memh = %p\n", (void *)(*memh_ptr)); + DBG_OUT("_comm_ucx_mem_map(after ucp_mem_query): memh = %p\n", (void *)(*memh_ptr)); assert(mem_attrs.length >= size); if (mem_type != OPAL_COMMON_UCX_MEM_ALLOCATE_MAP) { @@ -671,7 +709,7 @@ static int _comm_ucx_mem_map(opal_common_ucx_wpool_t *wpool, (*base) = mem_attrs.address; } - printf("_comm_ucx_mem_map(end): wpool = %p, addr = %p size = %d memh = %p\n", + DBG_OUT("_comm_ucx_mem_map(end): wpool = %p, addr = %p size = %d memh = %p\n", (void *)wpool, (void *)(*base), (int)size, (void *)(*memh_ptr)); return ret; error: @@ -706,7 +744,7 @@ int opal_common_ucx_mem_create(opal_common_ucx_ctx_t *ctx, int comm_size, MCA_COMMON_UCX_VERBOSE(1, "_comm_ucx_mem_map failed: %d", ret); goto error_mem_map; } - printf("opal_common_ucx_mem_create(after _comm_ucx_mem_map): base = %p, memh = %p\n", + DBG_OUT("opal_common_ucx_mem_create(after _comm_ucx_mem_map): base = %p, memh = %p\n", (void *)(*mem_base), (void *)(mem->memh)); status = ucp_rkey_pack(ctx->wpool->ucp_ctx, mem->memh, @@ -716,12 +754,12 @@ int opal_common_ucx_mem_create(opal_common_ucx_ctx_t *ctx, int comm_size, ret = OPAL_ERROR; goto error_rkey_pack; } - printf("opal_common_ucx_mem_create(after ucp_rkey_pack): rkey_addr = %p, rkey_addr_len = %d\n", + DBG_OUT("opal_common_ucx_mem_create(after ucp_rkey_pack): rkey_addr = %p, rkey_addr_len = %d\n", (void *)rkey_addr, (int)rkey_addr_len); ret = exchange_func(rkey_addr, rkey_addr_len, &mem->mem_addrs, &mem->mem_displs, exchange_metadata); - printf("opal_common_ucx_mem_create(after exchange_func): rkey_addr = %p, rkey_addr_len = %d mem_addrs = %p mem_displs = %p\n", + DBG_OUT("opal_common_ucx_mem_create(after exchange_func): rkey_addr = %p, rkey_addr_len = %d mem_addrs = %p mem_displs = %p\n", (void *)rkey_addr, (int)rkey_addr_len, (void *)mem->mem_addrs, (void *)mem->mem_displs); ucp_rkey_buffer_release(rkey_addr); @@ -731,7 +769,7 @@ int opal_common_ucx_mem_create(opal_common_ucx_ctx_t *ctx, int comm_size, (*mem_ptr) = mem; - printf("opal_common_ucx_mem_create(end): mem = %p\n", (void *)mem); + DBG_OUT("opal_common_ucx_mem_create(end): mem = %p\n", (void *)mem); return ret; error_rkey_pack: @@ -751,7 +789,7 @@ static void _common_ucx_mem_free(opal_common_ucx_mem_t *mem) ucp_mem_unmap(mem->ctx->wpool->ucp_ctx, mem->memh); OBJ_DESTRUCT(&mem->mutex); OBJ_DESTRUCT(&mem->registrations); - printf("_common_ucx_mem_free: mem = %p\n", (void *)mem); + DBG_OUT("_common_ucx_mem_free: mem = %p\n", (void *)mem); free(mem); } @@ -767,7 +805,7 @@ _common_ucx_mem_append(opal_common_ucx_mem_t *mem, opal_mutex_lock(&mem->mutex); opal_list_append(&mem->registrations, &item->super); opal_mutex_unlock(&mem->mutex); - printf("_common_ucx_mem_append: mem = %p, mem_rec = %p\n", (void *)mem, (void *)mem_rec); + DBG_OUT("_common_ucx_mem_append: mem = %p, mem_rec = %p\n", (void *)mem, (void *)mem_rec); return OPAL_SUCCESS; } @@ -796,7 +834,7 @@ _common_ucx_mem_remove(opal_common_ucx_mem_t *mem, _tlocal_mem_t *mem_rec) _common_ucx_mem_free(mem); } - printf("_common_ucx_mem_remove(end): mem = %p mem_rec = %p\n", (void *)mem, (void *)mem_rec); + DBG_OUT("_common_ucx_mem_remove(end): mem = %p mem_rec = %p\n", (void *)mem, (void *)mem_rec); return; } @@ -816,16 +854,16 @@ static _tlocal_table_t* _common_ucx_tls_init(opal_common_ucx_wpool_t *wpool) opal_mutex_unlock(&wpool->mutex); if(_tlocal_tls_ctxtbl_extend(tls, 4)){ - printf("_tlocal_tls_ctxtbl_extend failed\n"); + DBG_OUT("_tlocal_tls_ctxtbl_extend failed\n"); // TODO: handle error } if(_tlocal_tls_memtbl_extend(tls, 4)) { - printf("_tlocal_tls_memtbl_extend failed\n"); + DBG_OUT("_tlocal_tls_memtbl_extend failed\n"); // TODO: handle error } pthread_setspecific(_tlocal_key, tls); - printf("_common_ucx_tls_init(end): wpool = %p\n", (void *)wpool); + DBG_OUT("_common_ucx_tls_init(end): wpool = %p\n", (void *)wpool); return tls; } @@ -835,7 +873,7 @@ _tlocal_get_tls(opal_common_ucx_wpool_t *wpool){ if( OPAL_UNLIKELY(NULL == tls) ) { tls = _common_ucx_tls_init(wpool); } - printf("_tlocal_get_tls(end): wpool = %p tls = %p\n", (void *)wpool, (void *)tls); + DBG_OUT("_tlocal_get_tls(end): wpool = %p tls = %p\n", (void *)wpool, (void *)tls); return tls; } @@ -856,7 +894,7 @@ _tlocal_tls_get_worker(_tlocal_table_t *tls, _worker_info_t **_winfo) winfo->comm_size = 0; } *_winfo = winfo; - printf("_tlocal_tls_get_worker(end): tls = %p winfo = %p\n", (void *)tls, (void *)winfo); + DBG_OUT("_tlocal_tls_get_worker(end): tls = %p winfo = %p\n", (void *)tls, (void *)winfo); return OPAL_SUCCESS; } @@ -875,7 +913,7 @@ _tlocal_tls_ctxtbl_extend(_tlocal_table_t *tbl, size_t append) } tbl->ctx_tbl_size = newsize; - printf("_tlocal_tls_ctxtbl_extend(end): tbl = %p\n", (void *)tbl); + DBG_OUT("_tlocal_tls_ctxtbl_extend(end): tbl = %p\n", (void *)tbl); return OPAL_SUCCESS; } static int @@ -892,7 +930,7 @@ _tlocal_tls_memtbl_extend(_tlocal_table_t *tbl, size_t append) } } tbl->mem_tbl_size = newsize; - printf("_tlocal_tls_memtbl_extend(end): tbl = %p\n", (void *)tbl); + DBG_OUT("_tlocal_tls_memtbl_extend(end): tbl = %p\n", (void *)tbl); return OPAL_SUCCESS; } @@ -906,7 +944,7 @@ _tlocal_ctx_search(_tlocal_table_t *tls, int ctx_id) return tls->ctx_tbl[i]; } } - printf("_tlocal_tls_memtbl_extend(end): tls = %p ctx_id = %d\n", (void *)tls, ctx_id); + DBG_OUT("_tlocal_tls_memtbl_extend(end): tls = %p ctx_id = %d\n", (void *)tls, ctx_id); return NULL; } @@ -930,7 +968,7 @@ _tlocal_cleanup_ctx_record(_tlocal_ctx_t *ctx_rec) return rc; } memset(ctx_rec, 0, sizeof(*ctx_rec)); - printf("_tlocal_cleanup_ctx_record(end): ctx_rec = %p\n", (void *)ctx_rec); + DBG_OUT("_tlocal_cleanup_ctx_record(end): ctx_rec = %p\n", (void *)ctx_rec); return OPAL_SUCCESS; } @@ -969,7 +1007,7 @@ _tlocal_add_ctx(_tlocal_table_t *tls, opal_common_ucx_ctx_t *ctx) //TODO: error out return NULL; } - printf("_tlocal_add_ctx(after _tlocal_tls_get_worker): tls = %p winfo = %p\n", + DBG_OUT("_tlocal_add_ctx(after _tlocal_tls_get_worker): tls = %p winfo = %p\n", (void *)tls, (void *)tls->ctx_tbl[i]->winfo); tls->ctx_tbl[i]->winfo->endpoints = calloc(ctx->comm_size, sizeof(ucp_ep_h)); tls->ctx_tbl[i]->winfo->comm_size = ctx->comm_size; @@ -988,7 +1026,7 @@ _tlocal_add_ctx(_tlocal_table_t *tls, opal_common_ucx_ctx_t *ctx) //TODO: error out return NULL; } - printf("_tlocal_add_ctx(after _common_ucx_ctx_append): ctx = %p tls->ctx_tbl = %p\n", + DBG_OUT("_tlocal_add_ctx(after _common_ucx_ctx_append): ctx = %p tls->ctx_tbl = %p\n", (void *)ctx, (void *)tls->ctx_tbl); /* All good - return the record */ @@ -1015,7 +1053,7 @@ static int _tlocal_ctx_connect(_tlocal_ctx_t *ctx_rec, int target) MCA_COMMON_UCX_VERBOSE(1, "ucp_ep_create failed: %d", status); return OPAL_ERROR; } - printf("_tlocal_ctx_connect(after ucp_ep_create): worker = %p ep = %p\n", + DBG_OUT("_tlocal_ctx_connect(after ucp_ep_create): worker = %p ep = %p\n", (void *)winfo->worker, (void *)winfo->endpoints[target]); opal_mutex_unlock(&winfo->mutex); return OPAL_SUCCESS; @@ -1034,10 +1072,10 @@ static int _tlocal_ctx_release(opal_common_ucx_ctx_t *ctx) /* May free the ctx structure. Do not use it */ _common_ucx_ctx_remove(ctx, ctx_rec); - printf("_tlocal_ctx_release(after _common_ucx_ctx_remove): ctx = %p ctx_rec = %p\n", + DBG_OUT("_tlocal_ctx_release(after _common_ucx_ctx_remove): ctx = %p ctx_rec = %p\n", (void *)ctx, (void *)ctx_rec); rc = _wpool_add_to_idle(tls->wpool, ctx_rec->winfo); - printf("_tlocal_ctx_release(after _wpool_add_to_idle): wpool = %p winfo = %p\n", + DBG_OUT("_tlocal_ctx_release(after _wpool_add_to_idle): wpool = %p winfo = %p\n", (void *)tls->wpool, (void *)ctx_rec->winfo); ctx_rec->ctx_id = 0; @@ -1052,7 +1090,7 @@ static inline _tlocal_mem_t * _tlocal_search_mem(_tlocal_table_t *tls, int mem_id) { size_t i; - printf("_tlocal_search_mem(begin): tls = %p mem_id = %d\n", + DBG_OUT("_tlocal_search_mem(begin): tls = %p mem_id = %d\n", (void *)tls, (int)mem_id); for(i=0; imem_tbl_size; i++) { if( tls->mem_tbl[i]->mem_id == mem_id){ @@ -1074,13 +1112,13 @@ _tlocal_mem_record_cleanup(_tlocal_mem_t *mem_rec) * This may result in context release as we are using * delayed cleanup */ _common_ucx_mem_remove(mem_rec->gmem, mem_rec); - printf("_tlocal_mem_record_cleanup(_common_ucx_mem_remove): gmem = %p mem_rec = %p\n", + DBG_OUT("_tlocal_mem_record_cleanup(_common_ucx_mem_remove): gmem = %p mem_rec = %p\n", (void *)mem_rec->gmem, (void *)mem_rec); for(i = 0; i < mem_rec->gmem->ctx->comm_size; i++) { if (mem_rec->mem->rkeys[i]) { ucp_rkey_destroy(mem_rec->mem->rkeys[i]); - printf("_tlocal_mem_record_cleanup(after ucp_rkey_destroy): rkey_entry = %p\n", + DBG_OUT("_tlocal_mem_record_cleanup(after ucp_rkey_destroy): rkey_entry = %p\n", (void *)mem_rec->mem->rkeys[i]); } } @@ -1108,7 +1146,7 @@ static _tlocal_mem_t *_tlocal_add_mem(_tlocal_table_t *tls, if (tls->mem_tbl[i]->is_freed) { /* Found a dirty record. Need to clean it first */ _tlocal_mem_record_cleanup(tls->mem_tbl[i]); - printf("_tlocal_add_mem(after _tlocal_mem_record_cleanup): tls = %p mem_tbl_entry = %p\n", + DBG_OUT("_tlocal_add_mem(after _tlocal_mem_record_cleanup): tls = %p mem_tbl_entry = %p\n", (void *)tls, (void *)tls->mem_tbl[i]); break; } @@ -1121,7 +1159,7 @@ static _tlocal_mem_t *_tlocal_add_mem(_tlocal_table_t *tls, //TODO: error out return NULL; } - printf("_tlocal_add_mem(after _tlocal_tls_memtbl_extend): tls = %p\n", + DBG_OUT("_tlocal_add_mem(after _tlocal_tls_memtbl_extend): tls = %p\n", (void *)tls); } tls->mem_tbl[i]->mem_id = mem->mem_id; @@ -1133,7 +1171,7 @@ static _tlocal_mem_t *_tlocal_add_mem(_tlocal_table_t *tls, // TODO: act accordingly - cleanup return NULL; } - printf("_tlocal_add_mem(after _tlocal_ctx_search): tls = %p, ctx_id = %d\n", + DBG_OUT("_tlocal_add_mem(after _tlocal_ctx_search): tls = %p, ctx_id = %d\n", (void *)tls, (int)mem->ctx->ctx_id); tls->mem_tbl[i]->mem->worker = ctx_rec->winfo; @@ -1153,7 +1191,7 @@ static _tlocal_mem_t *_tlocal_add_mem(_tlocal_table_t *tls, // TODO: error handling return NULL; } - printf("_tlocal_add_mem(after _common_ucx_mem_append): mem = %p, mem_tbl_entry = %p\n", + DBG_OUT("_tlocal_add_mem(after _common_ucx_mem_append): mem = %p, mem_tbl_entry = %p\n", (void *)mem, (void *)tls->mem_tbl[i]); return tls->mem_tbl[i]; @@ -1172,7 +1210,7 @@ static int _tlocal_mem_create_rkey(_tlocal_mem_t *mem_rec, ucp_ep_h ep, int targ MCA_COMMON_UCX_VERBOSE(1, "ucp_ep_rkey_unpack failed: %d", status); return OPAL_ERROR; } - printf("_tlocal_mem_create_rkey(after ucp_ep_rkey_unpack): mem_rec = %p ep = %p target = %d\n", + DBG_OUT("_tlocal_mem_create_rkey(after ucp_ep_rkey_unpack): mem_rec = %p ep = %p target = %d\n", (void *)mem_rec, (void *)ep, target); return OPAL_SUCCESS; } @@ -1194,13 +1232,13 @@ static inline int _tlocal_fetch(opal_common_ucx_mem_t *mem, int target, /* Obtain the worker structure */ ctx_rec = _tlocal_ctx_search(tls, mem->ctx->ctx_id); - printf("_tlocal_fetch(after _tlocal_ctx_search): tls = %p ctx_id = %d\n", (void *)tls, (int)mem->ctx->ctx_id); + DBG_OUT("_tlocal_fetch(after _tlocal_ctx_search): tls = %p ctx_id = %d\n", (void *)tls, (int)mem->ctx->ctx_id); if (OPAL_UNLIKELY(NULL == ctx_rec)) { ctx_rec = _tlocal_add_ctx(tls, mem->ctx); if (NULL == ctx_rec) { return OPAL_ERR_OUT_OF_RESOURCE; } - printf("_tlocal_fetch(after _tlocal_add_ctx): tls = %p ctx = %p\n", (void *)tls, (void *)mem->ctx); + DBG_OUT("_tlocal_fetch(after _tlocal_add_ctx): tls = %p ctx = %p\n", (void *)tls, (void *)mem->ctx); } winfo = ctx_rec->winfo; @@ -1210,16 +1248,16 @@ static inline int _tlocal_fetch(opal_common_ucx_mem_t *mem, int target, if (rc != OPAL_SUCCESS) { return rc; } - printf("_tlocal_fetch(after _tlocal_ctx_connect): ctx_rec = %p target = %d\n", (void *)ctx_rec, target); + DBG_OUT("_tlocal_fetch(after _tlocal_ctx_connect): ctx_rec = %p target = %d\n", (void *)ctx_rec, target); } ep = winfo->endpoints[target]; /* Obtain the memory region info */ mem_rec = _tlocal_search_mem(tls, mem->mem_id); - printf("_tlocal_fetch(after _tlocal_search_mem): tls = %p mem_id = %d\n", (void *)tls, (int)mem->mem_id); + DBG_OUT("_tlocal_fetch(after _tlocal_search_mem): tls = %p mem_id = %d\n", (void *)tls, (int)mem->mem_id); if (OPAL_UNLIKELY(mem_rec == NULL)) { mem_rec = _tlocal_add_mem(tls, mem); - printf("_tlocal_fetch(after _tlocal_add_mem): tls = %p mem = %p\n", (void *)tls, (void *)mem); + DBG_OUT("_tlocal_fetch(after _tlocal_add_mem): tls = %p mem = %p\n", (void *)tls, (void *)mem); if (NULL == mem_rec) { return OPAL_ERR_OUT_OF_RESOURCE; } @@ -1233,7 +1271,7 @@ static inline int _tlocal_fetch(opal_common_ucx_mem_t *mem, int target, if (rc) { return rc; } - printf("_tlocal_fetch(after _tlocal_mem_create_rkey): mem_rec = %p ep = %p, target = %d\n", + DBG_OUT("_tlocal_fetch(after _tlocal_mem_create_rkey): mem_rec = %p ep = %p, target = %d\n", (void *)mem_rec, (void *)ep, target); } @@ -1241,7 +1279,7 @@ static inline int _tlocal_fetch(opal_common_ucx_mem_t *mem, int target, *_rkey = rkey = mem_info->rkeys[target]; *_winfo = winfo; - printf("_tlocal_fetch(end): ep = %p, rkey = %p, winfo = %p\n", + DBG_OUT("_tlocal_fetch(end): ep = %p, rkey = %p, winfo = %p\n", (void *)ep, (void *)rkey, (void *)winfo); return OPAL_SUCCESS; @@ -1266,7 +1304,7 @@ opal_common_ucx_mem_putget(opal_common_ucx_mem_t *mem, MCA_COMMON_UCX_VERBOSE(1, "tlocal_fetch failed: %d", rc); return rc; } - printf("opal_common_ucx_mem_putget(after _tlocal_fetch): mem = %p, ep = %p, rkey = %p, winfo = %p\n", + DBG_OUT("opal_common_ucx_mem_putget(after _tlocal_fetch): mem = %p, ep = %p, rkey = %p, winfo = %p\n", (void *)mem, (void *)ep, (void *)rkey, (void *)winfo); /* Perform the operation */ @@ -1279,7 +1317,7 @@ opal_common_ucx_mem_putget(opal_common_ucx_mem_t *mem, opal_mutex_unlock(&winfo->mutex); return OPAL_ERROR; } - printf("opal_common_ucx_mem_putget(after ucp_put_nbi): ep = %p, rkey = %p\n", + DBG_OUT("opal_common_ucx_mem_putget(after ucp_put_nbi): ep = %p, rkey = %p\n", (void *)ep, (void *)rkey); break; case OPAL_COMMON_UCX_GET: @@ -1289,7 +1327,7 @@ opal_common_ucx_mem_putget(opal_common_ucx_mem_t *mem, opal_mutex_unlock(&winfo->mutex); return OPAL_ERROR; } - printf("opal_common_ucx_mem_putget(after ucp_get_nbi): ep = %p, rkey = %p\n", + DBG_OUT("opal_common_ucx_mem_putget(after ucp_get_nbi): ep = %p, rkey = %p\n", (void *)ep, (void *)rkey); break; } @@ -1315,7 +1353,7 @@ int opal_common_ucx_mem_cmpswp(opal_common_ucx_mem_t *mem, MCA_COMMON_UCX_VERBOSE(1, "tlocal_fetch failed: %d", rc); return rc; } - printf("opal_common_ucx_mem_cmpswp(after _tlocal_fetch): mem = %p, ep = %p, rkey = %p, winfo = %p\n", + DBG_OUT("opal_common_ucx_mem_cmpswp(after _tlocal_fetch): mem = %p, ep = %p, rkey = %p, winfo = %p\n", (void *)mem, (void *)ep, (void *)rkey, (void *)winfo); /* Perform the operation */ @@ -1329,7 +1367,7 @@ int opal_common_ucx_mem_cmpswp(opal_common_ucx_mem_t *mem, opal_mutex_unlock(&winfo->mutex); return OPAL_ERROR; } - printf("opal_common_ucx_mem_cmpswp(after opal_common_ucx_atomic_cswap): ep = %p, rkey = %p\n", + DBG_OUT("opal_common_ucx_mem_cmpswp(after opal_common_ucx_atomic_cswap): ep = %p, rkey = %p\n", (void *)ep, (void *)rkey); opal_mutex_unlock(&winfo->mutex); @@ -1353,7 +1391,7 @@ int opal_common_ucx_mem_fetch(opal_common_ucx_mem_t *mem, MCA_COMMON_UCX_VERBOSE(1, "tlocal_fetch failed: %d", rc); return rc; } - printf("opal_common_ucx_mem_fetch(after _tlocal_fetch): mem = %p, ep = %p, rkey = %p, winfo = %p\n", + DBG_OUT("opal_common_ucx_mem_fetch(after _tlocal_fetch): mem = %p, ep = %p, rkey = %p, winfo = %p\n", (void *)mem, (void *)ep, (void *)rkey, (void *)winfo); /* Perform the operation */ @@ -1367,7 +1405,7 @@ int opal_common_ucx_mem_fetch(opal_common_ucx_mem_t *mem, MCA_COMMON_UCX_VERBOSE(1, "ucp_atomic_cswap64 failed: %d", status); return OPAL_ERROR; } - printf("opal_common_ucx_mem_fetch(after opal_common_ucx_atomic_fetch): ep = %p, rkey = %p\n", + DBG_OUT("opal_common_ucx_mem_fetch(after opal_common_ucx_atomic_fetch): ep = %p, rkey = %p\n", (void *)ep, (void *)rkey); opal_mutex_unlock(&winfo->mutex); @@ -1393,7 +1431,7 @@ int opal_common_ucx_mem_post(opal_common_ucx_mem_t *mem, MCA_COMMON_UCX_VERBOSE(1, "tlocal_fetch failed: %d", rc); return rc; } - printf("opal_common_ucx_mem_post(after _tlocal_fetch): mem = %p, ep = %p, rkey = %p, winfo = %p\n", + DBG_OUT("opal_common_ucx_mem_post(after _tlocal_fetch): mem = %p, ep = %p, rkey = %p, winfo = %p\n", (void *)mem, (void *)ep, (void *)rkey, (void *)winfo); /* Perform the operation */ @@ -1405,7 +1443,7 @@ int opal_common_ucx_mem_post(opal_common_ucx_mem_t *mem, MCA_COMMON_UCX_VERBOSE(1, "ucp_atomic_cswap64 failed: %d", status); return OPAL_ERROR; } - printf("opal_common_ucx_mem_post(after ucp_atomic_post): ep = %p, rkey = %p\n", (void *)ep, (void *)rkey); + DBG_OUT("opal_common_ucx_mem_post(after ucp_atomic_post): ep = %p, rkey = %p\n", (void *)ep, (void *)rkey); opal_mutex_unlock(&winfo->mutex); return OPAL_SUCCESS; @@ -1420,7 +1458,7 @@ opal_common_ucx_mem_flush(opal_common_ucx_mem_t *mem, opal_common_ucx_ctx_t *ctx = mem->ctx; int rc = OPAL_SUCCESS; - printf("opal_common_ucx_mem_flush: mem = %p, target = %d\n", (void *)mem, target); + DBG_OUT("opal_common_ucx_mem_flush: mem = %p, target = %d\n", (void *)mem, target); opal_mutex_lock(&ctx->mutex); OPAL_LIST_FOREACH(item, &ctx->workers, _worker_list_item_t) { @@ -1434,7 +1472,7 @@ opal_common_ucx_mem_flush(opal_common_ucx_mem_t *mem, MCA_COMMON_UCX_VERBOSE(1, "opal_common_ucx_worker_flush failed: %d", rc); return OPAL_ERROR; } - printf("opal_common_ucx_mem_flush(after opal_common_ucx_worker_flush): worker = %p\n", + DBG_OUT("opal_common_ucx_mem_flush(after opal_common_ucx_worker_flush): worker = %p\n", (void *)item->ptr->winfo->worker); opal_mutex_unlock(&item->ptr->winfo->mutex); break; @@ -1449,7 +1487,7 @@ opal_common_ucx_mem_flush(opal_common_ucx_mem_t *mem, MCA_COMMON_UCX_VERBOSE(1, "opal_common_ucx_ep_flush failed: %d", rc); return OPAL_ERROR; } - printf("opal_common_ucx_mem_flush(after opal_common_ucx_worker_flush): ep = %p worker = %p\n", + DBG_OUT("opal_common_ucx_mem_flush(after opal_common_ucx_worker_flush): ep = %p worker = %p\n", (void *)item->ptr->winfo->endpoints[target], (void *)item->ptr->winfo->worker); opal_mutex_unlock(&item->ptr->winfo->mutex); @@ -1466,7 +1504,7 @@ int opal_common_ucx_workers_progress(opal_common_ucx_wpool_t *wpool) { // TODO static int enter = 0; if (enter == 0) { - printf("opal_common_ucx_workres_progress: wpool = %p\n", (void *)wpool); + DBG_OUT("opal_common_ucx_workres_progress: wpool = %p\n", (void *)wpool); } enter++; From a0755fea8711ab55b235b582d33314c1f81e5505 Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Fri, 16 Nov 2018 16:29:10 -0800 Subject: [PATCH 26/59] fix dbg out --- opal/mca/common/ucx/common_ucx.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index e974e47fd4c..b10f64d5855 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -592,7 +592,7 @@ int opal_common_ucx_ctx_create(opal_common_ucx_wpool_t *wpool, int comm_size, } (*ctx_ptr) = ctx; - DBG_OUT("opal_common_ucx_wpool_create: wpool = %p, (*ctx_ptr) = %p\n", (void *)wpool, (void *)(*ctx_ptr)); + DBG_OUT("opal_common_ucx_ctx_create: wpool = %p, (*ctx_ptr) = %p\n", (void *)wpool, (void *)(*ctx_ptr)); return ret; error: @@ -944,7 +944,7 @@ _tlocal_ctx_search(_tlocal_table_t *tls, int ctx_id) return tls->ctx_tbl[i]; } } - DBG_OUT("_tlocal_tls_memtbl_extend(end): tls = %p ctx_id = %d\n", (void *)tls, ctx_id); + DBG_OUT("_tlocal_ctx_search: tls = %p ctx_id = %d\n", (void *)tls, ctx_id); return NULL; } @@ -1228,11 +1228,17 @@ static inline int _tlocal_fetch(opal_common_ucx_mem_t *mem, int target, ucp_rkey_h rkey; int rc = OPAL_SUCCESS; + DBG_OUT("_tlocal_fetch: starttls \n"); + tls = _tlocal_get_tls(mem->ctx->wpool); + DBG_OUT("_tlocal_fetch: tls = %p\n",(void*)tls); + /* Obtain the worker structure */ ctx_rec = _tlocal_ctx_search(tls, mem->ctx->ctx_id); - DBG_OUT("_tlocal_fetch(after _tlocal_ctx_search): tls = %p ctx_id = %d\n", (void *)tls, (int)mem->ctx->ctx_id); + + DBG_OUT("_tlocal_fetch(after _tlocal_ctx_search): ctx_id = %d, ctx_rec=%p\n", + (int)mem->ctx->ctx_id, ctx_rec); if (OPAL_UNLIKELY(NULL == ctx_rec)) { ctx_rec = _tlocal_add_ctx(tls, mem->ctx); if (NULL == ctx_rec) { @@ -1241,6 +1247,7 @@ static inline int _tlocal_fetch(opal_common_ucx_mem_t *mem, int target, DBG_OUT("_tlocal_fetch(after _tlocal_add_ctx): tls = %p ctx = %p\n", (void *)tls, (void *)mem->ctx); } winfo = ctx_rec->winfo; + DBG_OUT("_tlocal_fetch: winfo = %p ctx=%p\n", (void *)winfo, (void *)mem->ctx); /* Obtain the endpoint */ if (OPAL_UNLIKELY(NULL == winfo->endpoints[target])) { @@ -1251,10 +1258,11 @@ static inline int _tlocal_fetch(opal_common_ucx_mem_t *mem, int target, DBG_OUT("_tlocal_fetch(after _tlocal_ctx_connect): ctx_rec = %p target = %d\n", (void *)ctx_rec, target); } ep = winfo->endpoints[target]; + DBG_OUT("_tlocal_fetch: ep = %p\n", (void *)ep); /* Obtain the memory region info */ mem_rec = _tlocal_search_mem(tls, mem->mem_id); - DBG_OUT("_tlocal_fetch(after _tlocal_search_mem): tls = %p mem_id = %d\n", (void *)tls, (int)mem->mem_id); + DBG_OUT("_tlocal_fetch: tls = %p mem_rec = %p mem_id = %d\n", (void *)tls, (void *)mem_rec, (int)mem->mem_id); if (OPAL_UNLIKELY(mem_rec == NULL)) { mem_rec = _tlocal_add_mem(tls, mem); DBG_OUT("_tlocal_fetch(after _tlocal_add_mem): tls = %p mem = %p\n", (void *)tls, (void *)mem); @@ -1263,6 +1271,7 @@ static inline int _tlocal_fetch(opal_common_ucx_mem_t *mem, int target, } } mem_info = mem_rec->mem; + DBG_OUT("_tlocal_fetch: mem_info = %p\n", (void *)mem_info); /* Obtain the rkey */ if (OPAL_UNLIKELY(NULL == mem_info->rkeys[target])) { @@ -1271,9 +1280,9 @@ static inline int _tlocal_fetch(opal_common_ucx_mem_t *mem, int target, if (rc) { return rc; } - DBG_OUT("_tlocal_fetch(after _tlocal_mem_create_rkey): mem_rec = %p ep = %p, target = %d\n", - (void *)mem_rec, (void *)ep, target); + DBG_OUT("_tlocal_fetch: creating rkey ...\n"); } + DBG_OUT("_tlocal_fetch: rkey = %p\n", (void *)rkey); *_ep = ep; *_rkey = rkey = mem_info->rkeys[target]; From c91048371497d5812248703fb2cb5fcf2319f38e Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Fri, 16 Nov 2018 17:35:15 -0800 Subject: [PATCH 27/59] some fixes --- opal/mca/common/ucx/common_ucx.c | 95 +++++++++++++++++++++++++++----- 1 file changed, 81 insertions(+), 14 deletions(-) diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index b10f64d5855..b985e292a0a 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -122,14 +122,19 @@ void init_tls_dbg(void) static int _tlocal_tls_ctxtbl_extend(_tlocal_table_t *tbl, size_t append); static int _tlocal_tls_memtbl_extend(_tlocal_table_t *tbl, size_t append); static _tlocal_table_t* _common_ucx_tls_init(opal_common_ucx_wpool_t *wpool); +static void _common_ucx_tls_cleanup(_tlocal_table_t *tls); static inline _tlocal_ctx_t *_tlocal_ctx_search(_tlocal_table_t *tls, int ctx_id); -static int _tlocal_cleanup_ctx_record(_tlocal_ctx_t *ctx_rec); +static int _tlocal_ctx_record_cleanup(_tlocal_ctx_t *ctx_rec); static _tlocal_ctx_t *_tlocal_add_ctx(_tlocal_table_t *tls, opal_common_ucx_ctx_t *ctx); static int _tlocal_ctx_connect(_tlocal_ctx_t *ctx, int target); static int _tlocal_ctx_release(opal_common_ucx_ctx_t *ctx); static inline _tlocal_mem_t *_tlocal_search_mem(_tlocal_table_t *tls, int mem_id); static _tlocal_mem_t *_tlocal_add_mem(_tlocal_table_t *tls, opal_common_ucx_mem_t *mem); static int _tlocal_mem_create_rkey(_tlocal_mem_t *mem_rec, ucp_ep_h ep, int target); +// TOD: Return the error from it +static void _tlocal_mem_record_cleanup(_tlocal_mem_t *mem_rec); + + /***********************************************************************/ @@ -336,8 +341,26 @@ OPAL_DECLSPEC int opal_common_ucx_del_procs(opal_common_ucx_del_proc_t *procs, s static inline void _cleanup_tlocal(void *arg) { - // 1. Cleanup all rkeys in the window table - // 2. Return all workers into the idle pool + _tlocal_table_t *item = NULL, *next; + _tlocal_table_t *tls = (_tlocal_table_t *)arg; + opal_common_ucx_wpool_t *wpool = NULL; + + if (NULL == tls) { + return; + } + + wpool = tls->wpool; + /* 1. Remove us from tls_list */ + tls->wpool = wpool; + opal_mutex_lock(&wpool->mutex); + OPAL_LIST_FOREACH_SAFE(item, next, &wpool->tls_list, _tlocal_table_t) { + if (item == tls) { + opal_list_remove_item(&wpool->tls_list, &item->super); + _common_ucx_tls_cleanup(tls); + break; + } + } + opal_mutex_unlock(&wpool->mutex); } static @@ -412,10 +435,6 @@ _worker_info_t* _wpool_remove_from_idle(opal_common_ucx_wpool_t *wpool) return wkr; } - - - - OPAL_DECLSPEC opal_common_ucx_wpool_t * opal_common_ucx_wpool_allocate(void) { @@ -534,14 +553,23 @@ int opal_common_ucx_wpool_init(opal_common_ucx_wpool_t *wpool, OPAL_DECLSPEC void opal_common_ucx_wpool_finalize(opal_common_ucx_wpool_t *wpool) { + _tlocal_table_t *tls_item = NULL, *tls_next; + wpool->refcnt--; if (wpool->refcnt > 0) { DBG_OUT("opal_common_ucx_wpool_finalize: wpool = %p\n", (void *)wpool); return; } - /* Go over the list, free idle list items */ + pthread_key_delete(_tlocal_key); + opal_mutex_lock(&wpool->mutex); + OPAL_LIST_FOREACH_SAFE(tls_item, tls_next, &wpool->tls_list, _tlocal_table_t) { + opal_list_remove_item(&wpool->tls_list, &tls_item->super); + _common_ucx_tls_cleanup(tls_item); + } + + /* Go over the list, free idle list items */ if (!opal_list_is_empty(&wpool->idle_workers)) { _idle_list_item_t *item, *next; OPAL_LIST_FOREACH_SAFE(item, next, &wpool->idle_workers, _idle_list_item_t) { @@ -833,7 +861,6 @@ _common_ucx_mem_remove(opal_common_ucx_mem_t *mem, _tlocal_mem_t *mem_rec) * we can safely release communication context structure */ _common_ucx_mem_free(mem); } - DBG_OUT("_common_ucx_mem_remove(end): mem = %p mem_rec = %p\n", (void *)mem, (void *)mem_rec); return; } @@ -842,8 +869,13 @@ _common_ucx_mem_remove(opal_common_ucx_mem_t *mem, _tlocal_mem_t *mem_rec) // TODO: don't want to inline this function static _tlocal_table_t* _common_ucx_tls_init(opal_common_ucx_wpool_t *wpool) { - _tlocal_table_t *tls = NULL; - tls = OBJ_NEW(_tlocal_table_t); + _tlocal_table_t *tls = OBJ_NEW(_tlocal_table_t); + + if (tls == NULL) { + // return OPAL_ERR_OUT_OF_RESOURCE + return NULL; + } + memset(tls, 0, sizeof(*tls)); /* Add this TLS to the global wpool structure for future @@ -877,6 +909,41 @@ _tlocal_get_tls(opal_common_ucx_wpool_t *wpool){ return tls; } +_worker_list_item_t *item = NULL, *next; + +// TODO: don't want to inline this function +static void _common_ucx_tls_cleanup(_tlocal_table_t *tls) +{ + size_t i, size; + + // Cleanup memory table + size = tls->mem_tbl_size; + for (i = 0; i < size; i++) { + + if (!tls->mem_tbl[i]->mem_id){ + continue; + } + _tlocal_mem_record_cleanup(tls->mem_tbl[i]); + free(tls->mem_tbl[i]); + } + + // Cleanup ctx table + size = tls->ctx_tbl_size; + for (i = 0; i < size; i++) { + _tlocal_ctx_record_cleanup(tls->ctx_tbl[i]); + free(tls->ctx_tbl[i]); + } + + pthread_setspecific(_tlocal_key, NULL); + DBG_OUT("_common_ucx_tls_cleanup(end): tls = %p\n", (void *)tls); + + OBJ_RELEASE(tls); + + return; +} + + + static int _tlocal_tls_get_worker(_tlocal_table_t *tls, _worker_info_t **_winfo) { @@ -949,7 +1016,7 @@ _tlocal_ctx_search(_tlocal_table_t *tls, int ctx_id) } static int -_tlocal_cleanup_ctx_record(_tlocal_ctx_t *ctx_rec) +_tlocal_ctx_record_cleanup(_tlocal_ctx_t *ctx_rec) { int rc; if (!ctx_rec->is_freed) { @@ -987,7 +1054,7 @@ _tlocal_add_ctx(_tlocal_table_t *tls, opal_common_ucx_ctx_t *ctx) } if (tls->ctx_tbl[i]->is_freed ) { /* Found dirty record, need to clean first */ - _tlocal_cleanup_ctx_record(tls->ctx_tbl[i]); + _tlocal_ctx_record_cleanup(tls->ctx_tbl[i]); break; } } @@ -1238,7 +1305,7 @@ static inline int _tlocal_fetch(opal_common_ucx_mem_t *mem, int target, ctx_rec = _tlocal_ctx_search(tls, mem->ctx->ctx_id); DBG_OUT("_tlocal_fetch(after _tlocal_ctx_search): ctx_id = %d, ctx_rec=%p\n", - (int)mem->ctx->ctx_id, ctx_rec); + (int)mem->ctx->ctx_id, (void *)ctx_rec); if (OPAL_UNLIKELY(NULL == ctx_rec)) { ctx_rec = _tlocal_add_ctx(tls, mem->ctx); if (NULL == ctx_rec) { From b57c86ffb32e2499fd22444dd2d6408d245ff2cc Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Fri, 16 Nov 2018 17:38:29 -0800 Subject: [PATCH 28/59] fix --- opal/mca/common/ucx/common_ucx.c | 1 - 1 file changed, 1 deletion(-) diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index b985e292a0a..380edd3b5e2 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -1349,7 +1349,6 @@ static inline int _tlocal_fetch(opal_common_ucx_mem_t *mem, int target, } DBG_OUT("_tlocal_fetch: creating rkey ...\n"); } - DBG_OUT("_tlocal_fetch: rkey = %p\n", (void *)rkey); *_ep = ep; *_rkey = rkey = mem_info->rkeys[target]; From bacf3035cc8b4d251b5b7992e5e453c41e36bbe5 Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Fri, 16 Nov 2018 17:46:46 -0800 Subject: [PATCH 29/59] output --- opal/mca/common/ucx/common_ucx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index b985e292a0a..b61b9f5ad88 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -345,6 +345,8 @@ static inline void _cleanup_tlocal(void *arg) _tlocal_table_t *tls = (_tlocal_table_t *)arg; opal_common_ucx_wpool_t *wpool = NULL; + DBG_OUT("_cleanup_tlocal: start\n"); + if (NULL == tls) { return; } From cefad21094af593e109994750b016567455a5ef1 Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Fri, 16 Nov 2018 17:53:50 -0800 Subject: [PATCH 30/59] fixes --- opal/mca/common/ucx/common_ucx.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index d2884d52d23..6b8c4c35349 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -878,7 +878,10 @@ static _tlocal_table_t* _common_ucx_tls_init(opal_common_ucx_wpool_t *wpool) return NULL; } - memset(tls, 0, sizeof(*tls)); + tls->ctx_tbl = NULL; + tls->ctx_tbl_size = 0; + tls->mem_tbl = NULL; + tls->mem_tbl_size = 0; /* Add this TLS to the global wpool structure for future * cleanup purposes */ From 13caecf27b08051362c290ba143f5f0e844000a5 Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Fri, 16 Nov 2018 18:02:31 -0800 Subject: [PATCH 31/59] fixes --- ompi/mca/osc/ucx/osc_ucx_component.c | 16 +++++++++------- opal/mca/common/ucx/common_ucx.c | 3 +++ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/ompi/mca/osc/ucx/osc_ucx_component.c b/ompi/mca/osc/ucx/osc_ucx_component.c index 391cee773c1..326985c49c7 100644 --- a/ompi/mca/osc/ucx/osc_ucx_component.c +++ b/ompi/mca/osc/ucx/osc_ucx_component.c @@ -482,7 +482,7 @@ int ompi_osc_ucx_win_attach(struct ompi_win_t *win, void *base, size_t len) { (OMPI_OSC_UCX_ATTACH_MAX - (insert_index + 1)) * sizeof(ompi_osc_dynamic_win_info_t)); } else { insert_index = 0; - } + }*/ /* ret = mem_map(&base, len, &(module->local_dynamic_win_info[insert_index].memh), module, MPI_WIN_FLAVOR_CREATE); @@ -553,7 +553,7 @@ int ompi_osc_ucx_win_detach(struct ompi_win_t *win, const void *base) { int ompi_osc_ucx_free(struct ompi_win_t *win) { ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module; - int i, ret; + int ret; assert(module->global_ops_num == 0); assert(module->lock_count == 0); @@ -585,22 +585,24 @@ int ompi_osc_ucx_free(struct ompi_win_t *win) { free(module->win_info_array); free(module->state_info_array); - free(module->per_target_ops_nums); - if ((module->flavor == MPI_WIN_FLAVOR_ALLOCATE || module->flavor == MPI_WIN_FLAVOR_CREATE) && module->size > 0) { ucp_mem_unmap(mca_osc_ucx_component.ucp_context, module->memh); } ucp_mem_unmap(mca_osc_ucx_component.ucp_context, module->state_memh); + + return ret; + */ + free(module->per_target_ops_nums); + + opal_common_ucx_wpool_finalize(mca_osc_ucx_component.wpool); + if (module->disp_units) free(module->disp_units); ompi_comm_free(&module->comm); free(module); ompi_osc_ucx_unregister_progress(); - return ret; - */ - opal_common_ucx_wpool_finalize(mca_osc_ucx_component.wpool); return OMPI_SUCCESS; } diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index 6b8c4c35349..7ab7e7b4f5b 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -557,6 +557,8 @@ void opal_common_ucx_wpool_finalize(opal_common_ucx_wpool_t *wpool) { _tlocal_table_t *tls_item = NULL, *tls_next; + DBG_OUT("opal_common_ucx_wpool_finalize(start): wpool = %p\n", (void *)wpool); + wpool->refcnt--; if (wpool->refcnt > 0) { DBG_OUT("opal_common_ucx_wpool_finalize: wpool = %p\n", (void *)wpool); @@ -569,6 +571,7 @@ void opal_common_ucx_wpool_finalize(opal_common_ucx_wpool_t *wpool) OPAL_LIST_FOREACH_SAFE(tls_item, tls_next, &wpool->tls_list, _tlocal_table_t) { opal_list_remove_item(&wpool->tls_list, &tls_item->super); _common_ucx_tls_cleanup(tls_item); + DBG_OUT("opal_common_ucx_wpool_finalize: cleanup wpool = %p\n", (void *)wpool); } /* Go over the list, free idle list items */ From 32a1a2690beac0ef7e7767d1db40941690dc42e5 Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Fri, 16 Nov 2018 18:08:34 -0800 Subject: [PATCH 32/59] fix on atomic fadd --- opal/mca/common/ucx/common_ucx.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index 7ab7e7b4f5b..e49b7914316 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -608,7 +608,8 @@ int opal_common_ucx_ctx_create(opal_common_ucx_wpool_t *wpool, int comm_size, opal_common_ucx_ctx_t *ctx = calloc(1, sizeof(*ctx)); int ret = OPAL_SUCCESS; - ctx->ctx_id = OPAL_ATOMIC_ADD_FETCH32(&ctx->ctx_id, 1); + OPAL_ATOMIC_ADD_FETCH32(&ctx->ctx_id, 1); + DBG_OUT("ctx_create: ctx_id = %d\n", (int)ctx->ctx_id); OBJ_CONSTRUCT(&ctx->mutex, opal_mutex_t); OBJ_CONSTRUCT(&ctx->workers, opal_list_t); @@ -765,7 +766,9 @@ int opal_common_ucx_mem_create(opal_common_ucx_ctx_t *ctx, int comm_size, ucs_status_t status; int ret = OPAL_SUCCESS; - mem->mem_id = OPAL_ATOMIC_ADD_FETCH32(&mem->mem_id, 1); + OPAL_ATOMIC_ADD_FETCH32(&mem->mem_id, 1); + DBG_OUT("mem_create: mem_id = %d\n", (int)mem->mem_id); + OBJ_CONSTRUCT(&mem->mutex, opal_mutex_t); OBJ_CONSTRUCT(&mem->registrations, opal_list_t); mem->ctx = ctx; From c27fb704ae497e360d984a64e1df15620e94405c Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Fri, 16 Nov 2018 18:16:31 -0800 Subject: [PATCH 33/59] fixes --- opal/mca/common/ucx/common_ucx.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index e49b7914316..855d75aa907 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -1030,7 +1030,7 @@ static int _tlocal_ctx_record_cleanup(_tlocal_ctx_t *ctx_rec) { int rc; - if (!ctx_rec->is_freed) { + if (ctx_rec->is_freed) { return OPAL_SUCCESS; } /* Remove myself from the communication context structure @@ -1157,7 +1157,7 @@ static int _tlocal_ctx_release(opal_common_ucx_ctx_t *ctx) (void *)tls->wpool, (void *)ctx_rec->winfo); ctx_rec->ctx_id = 0; - ctx_rec->is_freed = 0; + ctx_rec->is_freed = 1; ctx_rec->gctx = NULL; ctx_rec->winfo = NULL; @@ -1183,7 +1183,9 @@ static void _tlocal_mem_record_cleanup(_tlocal_mem_t *mem_rec) { size_t i; - if (!mem_rec->is_freed) { + DBG_OUT("_tlocal_mem_record_cleanup: record=%p, is_freed = %d\n", + (void *)mem_rec, mem_rec->is_freed); + if (mem_rec->is_freed) { return; } /* Remove myself from the memory context structure @@ -1205,6 +1207,7 @@ _tlocal_mem_record_cleanup(_tlocal_mem_t *mem_rec) free(mem_rec->mem); memset(mem_rec, 0, sizeof(*mem_rec)); + mem_rec->is_freed = 1; } From 1a3b559f074544bc79a5308c7ad97265b5e60d1d Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Fri, 16 Nov 2018 18:25:16 -0800 Subject: [PATCH 34/59] fix --- opal/mca/common/ucx/common_ucx.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index 855d75aa907..38fcc85fa82 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -30,6 +30,7 @@ typedef struct { typedef struct { int ctx_id; + // TODO: make sure that this is being set by external thread int is_freed; opal_common_ucx_ctx_t *gctx; _worker_info_t *winfo; @@ -930,7 +931,6 @@ static void _common_ucx_tls_cleanup(_tlocal_table_t *tls) // Cleanup memory table size = tls->mem_tbl_size; for (i = 0; i < size; i++) { - if (!tls->mem_tbl[i]->mem_id){ continue; } @@ -941,6 +941,9 @@ static void _common_ucx_tls_cleanup(_tlocal_table_t *tls) // Cleanup ctx table size = tls->ctx_tbl_size; for (i = 0; i < size; i++) { + if (!tls->ctx_tbl[i]->ctx_id){ + continue; + } _tlocal_ctx_record_cleanup(tls->ctx_tbl[i]); free(tls->ctx_tbl[i]); } @@ -1030,7 +1033,7 @@ static int _tlocal_ctx_record_cleanup(_tlocal_ctx_t *ctx_rec) { int rc; - if (ctx_rec->is_freed) { + if (0 == ctx_rec->ctx_id) { return OPAL_SUCCESS; } /* Remove myself from the communication context structure @@ -1157,7 +1160,7 @@ static int _tlocal_ctx_release(opal_common_ucx_ctx_t *ctx) (void *)tls->wpool, (void *)ctx_rec->winfo); ctx_rec->ctx_id = 0; - ctx_rec->is_freed = 1; + ctx_rec->is_freed = 0; ctx_rec->gctx = NULL; ctx_rec->winfo = NULL; @@ -1207,7 +1210,6 @@ _tlocal_mem_record_cleanup(_tlocal_mem_t *mem_rec) free(mem_rec->mem); memset(mem_rec, 0, sizeof(*mem_rec)); - mem_rec->is_freed = 1; } From f0c2e15adbd1288e56f20480173c9b582a00fb73 Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Fri, 16 Nov 2018 18:29:45 -0800 Subject: [PATCH 35/59] fix --- opal/mca/common/ucx/common_ucx.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index 38fcc85fa82..6b1798986fd 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -395,7 +395,9 @@ int _wpool_add_to_idle(opal_common_ucx_wpool_t *wpool, _worker_info_t *winfo) if(winfo->comm_size != 0) { size_t i; for (i = 0; i < winfo->comm_size; i++) { - ucp_ep_destroy(winfo->endpoints[i]); + if (NULL != winfo->endpoints[i]){ + ucp_ep_destroy(winfo->endpoints[i]); + } } free(winfo->endpoints); winfo->endpoints = NULL; From 5410d82efbe138533b85503c875c7ef199919278 Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Fri, 16 Nov 2018 18:33:50 -0800 Subject: [PATCH 36/59] FIX --- opal/mca/common/ucx/common_ucx.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index 6b1798986fd..06db99aa5e9 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -573,9 +573,14 @@ void opal_common_ucx_wpool_finalize(opal_common_ucx_wpool_t *wpool) opal_mutex_lock(&wpool->mutex); OPAL_LIST_FOREACH_SAFE(tls_item, tls_next, &wpool->tls_list, _tlocal_table_t) { opal_list_remove_item(&wpool->tls_list, &tls_item->super); + opal_mutex_unlock(&wpool->mutex); + _common_ucx_tls_cleanup(tls_item); + + opal_mutex_lock(&wpool->mutex); DBG_OUT("opal_common_ucx_wpool_finalize: cleanup wpool = %p\n", (void *)wpool); } + opal_mutex_unlock(&wpool->mutex); /* Go over the list, free idle list items */ if (!opal_list_is_empty(&wpool->idle_workers)) { From b57a3b7b8ceb97cdb63cf42459a6fc8e17e301fd Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Fri, 16 Nov 2018 18:37:14 -0800 Subject: [PATCH 37/59] fix --- opal/mca/common/ucx/common_ucx.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index 06db99aa5e9..0723e25c07f 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -27,7 +27,6 @@ typedef struct { size_t comm_size; } _worker_info_t; - typedef struct { int ctx_id; // TODO: make sure that this is being set by external thread @@ -591,7 +590,7 @@ void opal_common_ucx_wpool_finalize(opal_common_ucx_wpool_t *wpool) curr_worker = item->ptr; OBJ_DESTRUCT(&curr_worker->mutex); ucp_worker_destroy(curr_worker->worker); - OBJ_RELEASE(curr_worker); + free(curr_worker); OBJ_RELEASE(item); } } From c80b2ff08a75a798efa935448427d7c309103e07 Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Fri, 16 Nov 2018 18:40:58 -0800 Subject: [PATCH 38/59] fix --- opal/mca/common/ucx/common_ucx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index 0723e25c07f..e957bbc57e7 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -581,6 +581,7 @@ void opal_common_ucx_wpool_finalize(opal_common_ucx_wpool_t *wpool) } opal_mutex_unlock(&wpool->mutex); + opal_mutex_lock(&wpool->mutex); /* Go over the list, free idle list items */ if (!opal_list_is_empty(&wpool->idle_workers)) { _idle_list_item_t *item, *next; From 30062416678b137b98aa90e563139857dbc666a7 Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Fri, 16 Nov 2018 18:44:52 -0800 Subject: [PATCH 39/59] fix --- opal/mca/common/ucx/common_ucx.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index e957bbc57e7..76790cd4040 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -581,6 +581,10 @@ void opal_common_ucx_wpool_finalize(opal_common_ucx_wpool_t *wpool) } opal_mutex_unlock(&wpool->mutex); + /* Release the address here. recv worker will be released + * below along with other idle workers */ + ucp_worker_release_address(wpool->recv_worker, wpool->recv_waddr); + opal_mutex_lock(&wpool->mutex); /* Go over the list, free idle list items */ if (!opal_list_is_empty(&wpool->idle_workers)) { @@ -600,8 +604,6 @@ void opal_common_ucx_wpool_finalize(opal_common_ucx_wpool_t *wpool) OBJ_DESTRUCT(&wpool->idle_workers); OBJ_DESTRUCT(&wpool->tls_list); OBJ_DESTRUCT(&wpool->mutex); - ucp_worker_release_address(wpool->recv_worker, wpool->recv_waddr); - ucp_worker_destroy(wpool->recv_worker); ucp_cleanup(wpool->ucp_ctx); DBG_OUT("opal_common_ucx_wpool_finalize: wpool = %p\n", (void *)wpool); return; From 228b03716bb6b80957b9057d772801f5aba0d8a4 Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Mon, 19 Nov 2018 13:44:49 -0800 Subject: [PATCH 40/59] add dbg_output to osc code --- ompi/mca/osc/ucx/osc_ucx_passive_target.c | 6 ++++ opal/mca/common/ucx/common_ucx.c | 34 +---------------------- opal/mca/common/ucx/common_ucx.h | 31 +++++++++++++++++++++ 3 files changed, 38 insertions(+), 33 deletions(-) diff --git a/ompi/mca/osc/ucx/osc_ucx_passive_target.c b/ompi/mca/osc/ucx/osc_ucx_passive_target.c index 4b05d1e7528..9a242f7f728 100644 --- a/ompi/mca/osc/ucx/osc_ucx_passive_target.c +++ b/ompi/mca/osc/ucx/osc_ucx_passive_target.c @@ -268,6 +268,8 @@ int ompi_osc_ucx_unlock_all(struct ompi_win_t *win) { int comm_size = ompi_comm_size(module->comm); int ret = OMPI_SUCCESS; + DBG_OUT("ompi_osc_ucx_unlock_all: start, mem = %p\n", (void *)module->mem); + if (module->epoch_type.access != PASSIVE_ALL_EPOCH) { return OMPI_ERR_RMA_SYNC; } @@ -280,6 +282,8 @@ int ompi_osc_ucx_unlock_all(struct ompi_win_t *win) { // return ret; // } + DBG_OUT("ompi_osc_ucx_unlock_all: after flush, mem = %p\n", (void *)module->mem); + module->global_ops_num = 0; memset(module->per_target_ops_nums, 0, sizeof(int) * comm_size); @@ -292,6 +296,8 @@ int ompi_osc_ucx_unlock_all(struct ompi_win_t *win) { module->epoch_type.access = NONE_EPOCH; + DBG_OUT("ompi_osc_ucx_unlock_all: end, mem = %p\n", (void *)module->mem); + return ret; } diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index 76790cd4040..61744206e43 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -16,7 +16,7 @@ #include "opal/memoryhooks/memory.h" #include -#include + /***********************************************************************/ @@ -87,38 +87,6 @@ OBJ_CLASS_INSTANCE(_tlocal_table_t, opal_list_item_t, NULL, NULL); static pthread_key_t _tlocal_key = {0}; -#define FDBG -#ifdef FDBG -__thread FILE *tls_pf = NULL; -__thread int initialized = 0; - -#include - -void init_tls_dbg(void) -{ - if( !initialized ) { - int tid = syscall(__NR_gettid); - char hname[128]; - gethostname(hname, 127); - char fname[128]; - - sprintf(fname, "%s.%d.log", hname, tid); - tls_pf = fopen(fname, "w"); - initialized = 1; - } -} - -#define DBG_OUT(...) \ -{ \ - init_tls_dbg(); \ - fprintf(tls_pf, __VA_ARGS__); \ -} - -#else -#define DBG_OUT(...) -#endif - - static int _tlocal_tls_ctxtbl_extend(_tlocal_table_t *tbl, size_t append); static int _tlocal_tls_memtbl_extend(_tlocal_table_t *tbl, size_t append); static _tlocal_table_t* _common_ucx_tls_init(opal_common_ucx_wpool_t *wpool); diff --git a/opal/mca/common/ucx/common_ucx.h b/opal/mca/common/ucx/common_ucx.h index b586292fc8e..5edecb89149 100644 --- a/opal/mca/common/ucx/common_ucx.h +++ b/opal/mca/common/ucx/common_ucx.h @@ -18,6 +18,7 @@ #include #include +#include #include "opal/mca/mca.h" #include "opal/util/output.h" @@ -191,7 +192,37 @@ OPAL_DECLSPEC int opal_common_ucx_mem_post(opal_common_ucx_mem_t *mem, size_t len, uint64_t rem_addr); +#define FDBG +#ifdef FDBG +static __thread FILE *tls_pf = NULL; +static __thread int initialized = 0; +#include +#include + +static inline void init_tls_dbg(void) +{ + if( !initialized ) { + int tid = syscall(__NR_gettid); + char hname[128]; + gethostname(hname, 127); + char fname[128]; + + sprintf(fname, "%s.%d.log", hname, tid); + tls_pf = fopen(fname, "w"); + initialized = 1; + } +} + +#define DBG_OUT(...) \ +{ \ + init_tls_dbg(); \ + fprintf(tls_pf, __VA_ARGS__); \ +} + +#else +#define DBG_OUT(...) +#endif OPAL_DECLSPEC void opal_common_ucx_mca_register(void); OPAL_DECLSPEC void opal_common_ucx_mca_deregister(void); From c46af4f28a69772f51793dea257331cd1acc166e Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Mon, 19 Nov 2018 13:58:15 -0800 Subject: [PATCH 41/59] fixes --- opal/mca/common/ucx/common_ucx.c | 2 ++ opal/mca/common/ucx/common_ucx.h | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index 61744206e43..cdac47437e4 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -103,6 +103,8 @@ static int _tlocal_mem_create_rkey(_tlocal_mem_t *mem_rec, ucp_ep_h ep, int targ static void _tlocal_mem_record_cleanup(_tlocal_mem_t *mem_rec); +__thread FILE *tls_pf = NULL; +__thread int initialized = 0; /***********************************************************************/ diff --git a/opal/mca/common/ucx/common_ucx.h b/opal/mca/common/ucx/common_ucx.h index 5edecb89149..961899a22d4 100644 --- a/opal/mca/common/ucx/common_ucx.h +++ b/opal/mca/common/ucx/common_ucx.h @@ -194,8 +194,8 @@ OPAL_DECLSPEC int opal_common_ucx_mem_post(opal_common_ucx_mem_t *mem, #define FDBG #ifdef FDBG -static __thread FILE *tls_pf = NULL; -static __thread int initialized = 0; +extern __thread FILE *tls_pf; +extern __thread int initialized; #include #include From f264ad38b082e1245af9eeaa070430be2d71cdae Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Mon, 19 Nov 2018 14:06:31 -0800 Subject: [PATCH 42/59] add timestep --- opal/mca/common/ucx/common_ucx.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/opal/mca/common/ucx/common_ucx.h b/opal/mca/common/ucx/common_ucx.h index 961899a22d4..5d01c9900bc 100644 --- a/opal/mca/common/ucx/common_ucx.h +++ b/opal/mca/common/ucx/common_ucx.h @@ -199,6 +199,8 @@ extern __thread int initialized; #include #include +#include +#include static inline void init_tls_dbg(void) { @@ -207,8 +209,10 @@ static inline void init_tls_dbg(void) char hname[128]; gethostname(hname, 127); char fname[128]; + struct timeval start; - sprintf(fname, "%s.%d.log", hname, tid); + gettimeofday(&start, NULL); + sprintf(fname, "[%ld] %s.%d.log", (start.tv_sec * 1000000 + start.tv_usec), hname, tid); tls_pf = fopen(fname, "w"); initialized = 1; } From d251dac9bc4c8ed3bb26e6fa344083c7ba1f4d3b Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Mon, 19 Nov 2018 14:11:09 -0800 Subject: [PATCH 43/59] fix on timestep --- opal/mca/common/ucx/common_ucx.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/opal/mca/common/ucx/common_ucx.h b/opal/mca/common/ucx/common_ucx.h index 5d01c9900bc..d75415f103b 100644 --- a/opal/mca/common/ucx/common_ucx.h +++ b/opal/mca/common/ucx/common_ucx.h @@ -209,10 +209,8 @@ static inline void init_tls_dbg(void) char hname[128]; gethostname(hname, 127); char fname[128]; - struct timeval start; - gettimeofday(&start, NULL); - sprintf(fname, "[%ld] %s.%d.log", (start.tv_sec * 1000000 + start.tv_usec), hname, tid); + sprintf(fname, " %s.%d.log", hname, tid); tls_pf = fopen(fname, "w"); initialized = 1; } @@ -220,7 +218,10 @@ static inline void init_tls_dbg(void) #define DBG_OUT(...) \ { \ + struct timeval start_; \ + gettimeofday(&start_, NULL); \ init_tls_dbg(); \ + fprintf(tls_pf, "[%ld] ", (start_.tv_sec * 1000000 + start_.tv_usec));\ fprintf(tls_pf, __VA_ARGS__); \ } From 7e1d6e0ce6771a2647bbea551d2cbe3e6bf925ee Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Mon, 19 Nov 2018 14:12:47 -0800 Subject: [PATCH 44/59] fix --- opal/mca/common/ucx/common_ucx.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opal/mca/common/ucx/common_ucx.h b/opal/mca/common/ucx/common_ucx.h index d75415f103b..929130d9868 100644 --- a/opal/mca/common/ucx/common_ucx.h +++ b/opal/mca/common/ucx/common_ucx.h @@ -210,7 +210,7 @@ static inline void init_tls_dbg(void) gethostname(hname, 127); char fname[128]; - sprintf(fname, " %s.%d.log", hname, tid); + sprintf(fname, "%s.%d.log", hname, tid); tls_pf = fopen(fname, "w"); initialized = 1; } From 08038805e0b59636dbc9a0f5103e216b18f62e1d Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Mon, 19 Nov 2018 14:19:47 -0800 Subject: [PATCH 45/59] fixes --- opal/mca/common/ucx/common_ucx.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/opal/mca/common/ucx/common_ucx.h b/opal/mca/common/ucx/common_ucx.h index 929130d9868..89966f08273 100644 --- a/opal/mca/common/ucx/common_ucx.h +++ b/opal/mca/common/ucx/common_ucx.h @@ -218,10 +218,16 @@ static inline void init_tls_dbg(void) #define DBG_OUT(...) \ { \ - struct timeval start_; \ - gettimeofday(&start_, NULL); \ + struct timeval start_; \ + time_t nowtime_; \ + struct tm *nowtm_; \ + char tmbuf_[64]; \ + gettimeofday(&start_, NULL); \ + nowtime_ = start_.tv_sec; \ + nowtm_ = localtime(&nowtime_); \ + strftime(tmbuf_, sizeof(tmbuf_), "%H:%M:%S", nowtm_); \ init_tls_dbg(); \ - fprintf(tls_pf, "[%ld] ", (start_.tv_sec * 1000000 + start_.tv_usec));\ + fprintf(tls_pf, "[%s.%06ld] ", tmbuf_, start_.tv_usec);\ fprintf(tls_pf, __VA_ARGS__); \ } From 6bc97627aa1079ce77e96ae6a6d3b6195a17198f Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Mon, 19 Nov 2018 14:32:07 -0800 Subject: [PATCH 46/59] modify on win_free --- ompi/mca/osc/ucx/osc_ucx_component.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/ompi/mca/osc/ucx/osc_ucx_component.c b/ompi/mca/osc/ucx/osc_ucx_component.c index 326985c49c7..01ad98aa8c6 100644 --- a/ompi/mca/osc/ucx/osc_ucx_component.c +++ b/ompi/mca/osc/ucx/osc_ucx_component.c @@ -555,6 +555,8 @@ int ompi_osc_ucx_free(struct ompi_win_t *win) { ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module; int ret; + DBG_OUT("ompi_osc_ucx_free: start, mem = %p\n", (void *)module->mem); + assert(module->global_ops_num == 0); assert(module->lock_count == 0); assert(opal_list_is_empty(&module->pending_posts) == true); @@ -572,8 +574,15 @@ int ompi_osc_ucx_free(struct ompi_win_t *win) { OSC_UCX_VERBOSE(1, "opal_common_ucx_worker_flush failed: %d", ret); } */ + opal_common_ucx_mem_flush(module->mem, OPAL_COMMON_UCX_SCOPE_WORKER, 0); + + DBG_OUT("ompi_osc_ucx_free: after mem_flush, mem = %p\n", (void *)module->mem); + ret = module->comm->c_coll->coll_barrier(module->comm, module->comm->c_coll->coll_barrier_module); + + DBG_OUT("ompi_osc_ucx_free: after barrier, mem = %p\n", (void *)module->mem); + /* for (i = 0; i < ompi_comm_size(module->comm); i++) { if ((module->win_info_array[i]).rkey_init == true) { From 66f85a30cfc40bd3a04ff0b479a738c766119c88 Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Mon, 19 Nov 2018 14:37:17 -0800 Subject: [PATCH 47/59] fixes on win_free --- ompi/mca/osc/ucx/osc_ucx_component.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/ompi/mca/osc/ucx/osc_ucx_component.c b/ompi/mca/osc/ucx/osc_ucx_component.c index 01ad98aa8c6..2e2d0a6b6b9 100644 --- a/ompi/mca/osc/ucx/osc_ucx_component.c +++ b/ompi/mca/osc/ucx/osc_ucx_component.c @@ -563,17 +563,11 @@ int ompi_osc_ucx_free(struct ompi_win_t *win) { OBJ_DESTRUCT(&module->outstanding_locks); OBJ_DESTRUCT(&module->pending_posts); - /* while (module->state.lock != TARGET_LOCK_UNLOCKED) { // not sure if this is required - ucp_worker_progress(mca_osc_ucx_component.ucp_worker); + // ucp_worker_progress(mca_osc_ucx_component.ucp_worker); } - ret = opal_common_ucx_worker_flush(mca_osc_ucx_component.ucp_worker); - if (OMPI_SUCCESS != ret) { - OSC_UCX_VERBOSE(1, "opal_common_ucx_worker_flush failed: %d", ret); - } -*/ opal_common_ucx_mem_flush(module->mem, OPAL_COMMON_UCX_SCOPE_WORKER, 0); DBG_OUT("ompi_osc_ucx_free: after mem_flush, mem = %p\n", (void *)module->mem); From bf023dbf12a7a6982df74016748034bf954ee169 Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Mon, 19 Nov 2018 15:22:50 -0800 Subject: [PATCH 48/59] fix on osc_win_free --- ompi/mca/osc/ucx/osc_ucx_component.c | 9 +++++---- opal/mca/common/ucx/common_ucx.c | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/ompi/mca/osc/ucx/osc_ucx_component.c b/ompi/mca/osc/ucx/osc_ucx_component.c index 2e2d0a6b6b9..c53d3b90824 100644 --- a/ompi/mca/osc/ucx/osc_ucx_component.c +++ b/ompi/mca/osc/ucx/osc_ucx_component.c @@ -563,15 +563,16 @@ int ompi_osc_ucx_free(struct ompi_win_t *win) { OBJ_DESTRUCT(&module->outstanding_locks); OBJ_DESTRUCT(&module->pending_posts); + opal_common_ucx_mem_flush(module->mem, OPAL_COMMON_UCX_SCOPE_WORKER, 0); + + DBG_OUT("ompi_osc_ucx_free: after mem_flush, mem = %p lock flag = %d\n", + (void *)module->mem, (int)module->state.lock); + while (module->state.lock != TARGET_LOCK_UNLOCKED) { // not sure if this is required // ucp_worker_progress(mca_osc_ucx_component.ucp_worker); } - opal_common_ucx_mem_flush(module->mem, OPAL_COMMON_UCX_SCOPE_WORKER, 0); - - DBG_OUT("ompi_osc_ucx_free: after mem_flush, mem = %p\n", (void *)module->mem); - ret = module->comm->c_coll->coll_barrier(module->comm, module->comm->c_coll->coll_barrier_module); diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index cdac47437e4..428519bebbf 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -1510,7 +1510,7 @@ int opal_common_ucx_mem_post(opal_common_ucx_mem_t *mem, len, rem_addr, rkey); if (status != UCS_OK) { opal_mutex_unlock(&winfo->mutex); - MCA_COMMON_UCX_VERBOSE(1, "ucp_atomic_cswap64 failed: %d", status); + MCA_COMMON_UCX_VERBOSE(1, "ucp_atomic_post failed: %d", status); return OPAL_ERROR; } DBG_OUT("opal_common_ucx_mem_post(after ucp_atomic_post): ep = %p, rkey = %p\n", (void *)ep, (void *)rkey); From 94e0c8a3797671944fbef43d00d3b26c40e7b4f2 Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Mon, 19 Nov 2018 15:29:04 -0800 Subject: [PATCH 49/59] fixes --- ompi/mca/osc/ucx/osc_ucx_component.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ompi/mca/osc/ucx/osc_ucx_component.c b/ompi/mca/osc/ucx/osc_ucx_component.c index c53d3b90824..b98ff025504 100644 --- a/ompi/mca/osc/ucx/osc_ucx_component.c +++ b/ompi/mca/osc/ucx/osc_ucx_component.c @@ -555,7 +555,8 @@ int ompi_osc_ucx_free(struct ompi_win_t *win) { ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module; int ret; - DBG_OUT("ompi_osc_ucx_free: start, mem = %p\n", (void *)module->mem); + DBG_OUT("ompi_osc_ucx_free: start, mem = %p lock flag = %d\n", + (void *)module->mem, (int)module->state.lock); assert(module->global_ops_num == 0); assert(module->lock_count == 0); @@ -568,10 +569,12 @@ int ompi_osc_ucx_free(struct ompi_win_t *win) { DBG_OUT("ompi_osc_ucx_free: after mem_flush, mem = %p lock flag = %d\n", (void *)module->mem, (int)module->state.lock); + /* while (module->state.lock != TARGET_LOCK_UNLOCKED) { // not sure if this is required // ucp_worker_progress(mca_osc_ucx_component.ucp_worker); } + */ ret = module->comm->c_coll->coll_barrier(module->comm, module->comm->c_coll->coll_barrier_module); From 676839eeec44303a4f7eeae98d0b53e667322c57 Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Mon, 19 Nov 2018 15:49:41 -0800 Subject: [PATCH 50/59] fixes --- ompi/mca/osc/ucx/osc_ucx_passive_target.c | 8 +------- opal/mca/common/ucx/common_ucx.c | 3 +++ 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/ompi/mca/osc/ucx/osc_ucx_passive_target.c b/ompi/mca/osc/ucx/osc_ucx_passive_target.c index 9a242f7f728..19a78ebaef5 100644 --- a/ompi/mca/osc/ucx/osc_ucx_passive_target.c +++ b/ompi/mca/osc/ucx/osc_ucx_passive_target.c @@ -24,13 +24,6 @@ static inline int start_shared(ompi_osc_ucx_module_t *module, int target) { // ucs_status_t status; // int ret; -/* - volatile int delay = 1; - while( delay ){ - sleep(1); - } -*/ - return OMPI_SUCCESS; while (true) { @@ -43,6 +36,7 @@ static inline int start_shared(ompi_osc_ucx_module_t *module, int target) { // return ret; // } + DBG_OUT("start_shared: after fadd, result_value = %d", (int)result_value); assert((int64_t)result_value >= 0); if (result_value >= TARGET_LOCK_EXCLUSIVE) { // status = ucp_atomic_post(ep, UCP_ATOMIC_POST_OP_ADD, (-1), sizeof(uint64_t), diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index 428519bebbf..195c3ed3b57 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -1514,6 +1514,9 @@ int opal_common_ucx_mem_post(opal_common_ucx_mem_t *mem, return OPAL_ERROR; } DBG_OUT("opal_common_ucx_mem_post(after ucp_atomic_post): ep = %p, rkey = %p\n", (void *)ep, (void *)rkey); + + + opal_mutex_unlock(&winfo->mutex); return OPAL_SUCCESS; From c54b7a4b618ff5ae2ed0fafa1626035eb10ebc0a Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Mon, 19 Nov 2018 15:54:11 -0800 Subject: [PATCH 51/59] fixes --- ompi/mca/osc/ucx/osc_ucx_passive_target.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ompi/mca/osc/ucx/osc_ucx_passive_target.c b/ompi/mca/osc/ucx/osc_ucx_passive_target.c index 19a78ebaef5..33155204101 100644 --- a/ompi/mca/osc/ucx/osc_ucx_passive_target.c +++ b/ompi/mca/osc/ucx/osc_ucx_passive_target.c @@ -24,8 +24,6 @@ static inline int start_shared(ompi_osc_ucx_module_t *module, int target) { // ucs_status_t status; // int ret; - return OMPI_SUCCESS; - while (true) { opal_common_ucx_mem_fetch(module->state_mem, UCP_ATOMIC_FETCH_OP_FADD, 1, target, @@ -37,6 +35,7 @@ static inline int start_shared(ompi_osc_ucx_module_t *module, int target) { // } DBG_OUT("start_shared: after fadd, result_value = %d", (int)result_value); + assert((int64_t)result_value >= 0); if (result_value >= TARGET_LOCK_EXCLUSIVE) { // status = ucp_atomic_post(ep, UCP_ATOMIC_POST_OP_ADD, (-1), sizeof(uint64_t), From 53d81f0f8205374c52507e50329ae5665e29f985 Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Mon, 19 Nov 2018 16:18:54 -0800 Subject: [PATCH 52/59] fixes --- opal/mca/common/ucx/common_ucx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index 195c3ed3b57..315b466caed 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -1510,7 +1510,7 @@ int opal_common_ucx_mem_post(opal_common_ucx_mem_t *mem, len, rem_addr, rkey); if (status != UCS_OK) { opal_mutex_unlock(&winfo->mutex); - MCA_COMMON_UCX_VERBOSE(1, "ucp_atomic_post failed: %d", status); + MCA_COMMON_UCX_ERROR("ucp_atomic_post failed: %d", status); return OPAL_ERROR; } DBG_OUT("opal_common_ucx_mem_post(after ucp_atomic_post): ep = %p, rkey = %p\n", (void *)ep, (void *)rkey); From 3d212a37260820ccf419b00616ab7c02b28a8a07 Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Mon, 19 Nov 2018 16:45:59 -0800 Subject: [PATCH 53/59] fix --- opal/mca/common/ucx/common_ucx.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index 315b466caed..b3a12786f28 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -588,7 +588,7 @@ int opal_common_ucx_ctx_create(opal_common_ucx_wpool_t *wpool, int comm_size, opal_common_ucx_ctx_t *ctx = calloc(1, sizeof(*ctx)); int ret = OPAL_SUCCESS; - OPAL_ATOMIC_ADD_FETCH32(&ctx->ctx_id, 1); + ctx->ctx_id = OPAL_ATOMIC_ADD_FETCH32(&wpool->cur_ctxid, 1); DBG_OUT("ctx_create: ctx_id = %d\n", (int)ctx->ctx_id); OBJ_CONSTRUCT(&ctx->mutex, opal_mutex_t); @@ -746,7 +746,8 @@ int opal_common_ucx_mem_create(opal_common_ucx_ctx_t *ctx, int comm_size, ucs_status_t status; int ret = OPAL_SUCCESS; - OPAL_ATOMIC_ADD_FETCH32(&mem->mem_id, 1); + mem->mem_id = OPAL_ATOMIC_ADD_FETCH32(&ctx->wpool->cur_memid, 1); + DBG_OUT("mem_create: mem_id = %d\n", (int)mem->mem_id); OBJ_CONSTRUCT(&mem->mutex, opal_mutex_t); @@ -935,8 +936,6 @@ static void _common_ucx_tls_cleanup(_tlocal_table_t *tls) return; } - - static int _tlocal_tls_get_worker(_tlocal_table_t *tls, _worker_info_t **_winfo) { @@ -1052,7 +1051,7 @@ _tlocal_add_ctx(_tlocal_table_t *tls, opal_common_ucx_ctx_t *ctx) } } - if( tls->ctx_tbl_size >= i ){ + if( i >= tls->ctx_tbl_size ){ i = tls->ctx_tbl_size; rc = _tlocal_tls_ctxtbl_extend(tls, 4); if (rc) { @@ -1060,6 +1059,7 @@ _tlocal_add_ctx(_tlocal_table_t *tls, opal_common_ucx_ctx_t *ctx) return NULL; } } + tls->ctx_tbl[i]->ctx_id = ctx->ctx_id; tls->ctx_tbl[i]->gctx = ctx; rc = _tlocal_tls_get_worker(tls, &tls->ctx_tbl[i]->winfo); @@ -1214,7 +1214,7 @@ static _tlocal_mem_t *_tlocal_add_mem(_tlocal_table_t *tls, } } - if( tls->mem_tbl_size >= i ){ + if( i >= tls->mem_tbl_size ){ i = tls->mem_tbl_size; rc = _tlocal_tls_memtbl_extend(tls, 4); if (rc != OPAL_SUCCESS) { From 492417ec21c16d6a481fc4b8175c16fa5463ecf8 Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Mon, 19 Nov 2018 17:40:21 -0800 Subject: [PATCH 54/59] fix race --- opal/mca/common/ucx/common_ucx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index b3a12786f28..6f5539c8f73 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -328,11 +328,11 @@ static inline void _cleanup_tlocal(void *arg) OPAL_LIST_FOREACH_SAFE(item, next, &wpool->tls_list, _tlocal_table_t) { if (item == tls) { opal_list_remove_item(&wpool->tls_list, &item->super); - _common_ucx_tls_cleanup(tls); break; } } opal_mutex_unlock(&wpool->mutex); + _common_ucx_tls_cleanup(tls); } static From 755bf46ac61521fa1bf92fa4aaa205f4e383c371 Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Mon, 19 Nov 2018 18:14:44 -0800 Subject: [PATCH 55/59] fixes --- opal/mca/common/ucx/common_ucx.c | 66 +++++++++++++++----------------- 1 file changed, 31 insertions(+), 35 deletions(-) diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index 6f5539c8f73..93dd22c0d1e 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -1382,27 +1382,30 @@ opal_common_ucx_mem_putget(opal_common_ucx_mem_t *mem, switch(op){ case OPAL_COMMON_UCX_PUT: status = ucp_put_nbi(ep, buffer,len, rem_addr, rkey); + // TODO: movethis duplicated if-else out of switch + // char *func = "ucp_put_nbi"; + // verbose("... func = %s...", func); if (status != UCS_OK && status != UCS_INPROGRESS) { MCA_COMMON_UCX_VERBOSE(1, "ucp_put_nbi failed: %d", status); - opal_mutex_unlock(&winfo->mutex); - return OPAL_ERROR; + rc = OPAL_ERROR; + } else { + DBG_OUT("opal_common_ucx_mem_putget(after ucp_put_nbi): ep = %p, rkey = %p\n", + (void *)ep, (void *)rkey); } - DBG_OUT("opal_common_ucx_mem_putget(after ucp_put_nbi): ep = %p, rkey = %p\n", - (void *)ep, (void *)rkey); break; case OPAL_COMMON_UCX_GET: status = ucp_get_nbi(ep, buffer,len, rem_addr, rkey); if (status != UCS_OK && status != UCS_INPROGRESS) { MCA_COMMON_UCX_VERBOSE(1, "ucp_get_nbi failed: %d", status); - opal_mutex_unlock(&winfo->mutex); - return OPAL_ERROR; + rc = OPAL_ERROR; + } else { + DBG_OUT("opal_common_ucx_mem_putget(after ucp_get_nbi): ep = %p, rkey = %p\n", + (void *)ep, (void *)rkey); } - DBG_OUT("opal_common_ucx_mem_putget(after ucp_get_nbi): ep = %p, rkey = %p\n", - (void *)ep, (void *)rkey); break; } opal_mutex_unlock(&winfo->mutex); - return OPAL_SUCCESS; + return rc; } @@ -1434,14 +1437,14 @@ int opal_common_ucx_mem_cmpswp(opal_common_ucx_mem_t *mem, winfo->worker); if (status != UCS_OK) { MCA_COMMON_UCX_VERBOSE(1, "opal_common_ucx_atomic_cswap failed: %d", status); - opal_mutex_unlock(&winfo->mutex); - return OPAL_ERROR; + rc = OPAL_ERROR; + } else { + DBG_OUT("opal_common_ucx_mem_cmpswp(after opal_common_ucx_atomic_cswap): ep = %p, rkey = %p\n", + (void *)ep, (void *)rkey); } - DBG_OUT("opal_common_ucx_mem_cmpswp(after opal_common_ucx_atomic_cswap): ep = %p, rkey = %p\n", - (void *)ep, (void *)rkey); - opal_mutex_unlock(&winfo->mutex); - return OPAL_SUCCESS; + + return rc; } OPAL_DECLSPEC @@ -1471,16 +1474,15 @@ int opal_common_ucx_mem_fetch(opal_common_ucx_mem_t *mem, rem_addr, rkey, winfo->worker); if (status != UCS_OK) { - opal_mutex_unlock(&winfo->mutex); MCA_COMMON_UCX_VERBOSE(1, "ucp_atomic_cswap64 failed: %d", status); - return OPAL_ERROR; + rc = OPAL_ERROR; + } else { + DBG_OUT("opal_common_ucx_mem_fetch(after opal_common_ucx_atomic_fetch): ep = %p, rkey = %p\n", + (void *)ep, (void *)rkey); } - DBG_OUT("opal_common_ucx_mem_fetch(after opal_common_ucx_atomic_fetch): ep = %p, rkey = %p\n", - (void *)ep, (void *)rkey); - opal_mutex_unlock(&winfo->mutex); - return OPAL_SUCCESS; + return rc; } @@ -1496,6 +1498,7 @@ int opal_common_ucx_mem_post(opal_common_ucx_mem_t *mem, ucs_status_t status; int rc = OPAL_SUCCESS; + rc =_tlocal_fetch(mem, target, &ep, &rkey, &winfo); if(OPAL_SUCCESS != rc){ MCA_COMMON_UCX_VERBOSE(1, "tlocal_fetch failed: %d", rc); @@ -1509,17 +1512,13 @@ int opal_common_ucx_mem_post(opal_common_ucx_mem_t *mem, status = ucp_atomic_post(ep, opcode, value, len, rem_addr, rkey); if (status != UCS_OK) { - opal_mutex_unlock(&winfo->mutex); MCA_COMMON_UCX_ERROR("ucp_atomic_post failed: %d", status); - return OPAL_ERROR; + rc = OPAL_ERROR; + } else { + DBG_OUT("opal_common_ucx_mem_post(after ucp_atomic_post): ep = %p, rkey = %p\n", (void *)ep, (void *)rkey); } - DBG_OUT("opal_common_ucx_mem_post(after ucp_atomic_post): ep = %p, rkey = %p\n", (void *)ep, (void *)rkey); - - - opal_mutex_unlock(&winfo->mutex); - - return OPAL_SUCCESS; + return rc; } OPAL_DECLSPEC int @@ -1533,6 +1532,7 @@ opal_common_ucx_mem_flush(opal_common_ucx_mem_t *mem, DBG_OUT("opal_common_ucx_mem_flush: mem = %p, target = %d\n", (void *)mem, target); + // TODO: make this as a read lock opal_mutex_lock(&ctx->mutex); OPAL_LIST_FOREACH(item, &ctx->workers, _worker_list_item_t) { switch (scope) { @@ -1540,10 +1540,8 @@ opal_common_ucx_mem_flush(opal_common_ucx_mem_t *mem, opal_mutex_lock(&item->ptr->winfo->mutex); rc = opal_common_ucx_worker_flush(item->ptr->winfo->worker); if (rc != OPAL_SUCCESS) { - opal_mutex_unlock(&item->ptr->winfo->mutex); - opal_mutex_unlock(&ctx->mutex); MCA_COMMON_UCX_VERBOSE(1, "opal_common_ucx_worker_flush failed: %d", rc); - return OPAL_ERROR; + rc = OPAL_ERROR; } DBG_OUT("opal_common_ucx_mem_flush(after opal_common_ucx_worker_flush): worker = %p\n", (void *)item->ptr->winfo->worker); @@ -1555,10 +1553,8 @@ opal_common_ucx_mem_flush(opal_common_ucx_mem_t *mem, rc = opal_common_ucx_ep_flush(item->ptr->winfo->endpoints[target], item->ptr->winfo->worker); if (rc != OPAL_SUCCESS) { - opal_mutex_unlock(&item->ptr->winfo->mutex); - opal_mutex_unlock(&ctx->mutex); MCA_COMMON_UCX_VERBOSE(1, "opal_common_ucx_ep_flush failed: %d", rc); - return OPAL_ERROR; + rc = OPAL_ERROR; } DBG_OUT("opal_common_ucx_mem_flush(after opal_common_ucx_worker_flush): ep = %p worker = %p\n", (void *)item->ptr->winfo->endpoints[target], From 6673f47f9f836ae090b087e3c260fef150b59966 Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Mon, 19 Nov 2018 21:43:33 -0800 Subject: [PATCH 56/59] fixes on worker progress --- ompi/mca/osc/ucx/osc_ucx_component.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/ompi/mca/osc/ucx/osc_ucx_component.c b/ompi/mca/osc/ucx/osc_ucx_component.c index b98ff025504..a8bd78b78ce 100644 --- a/ompi/mca/osc/ucx/osc_ucx_component.c +++ b/ompi/mca/osc/ucx/osc_ucx_component.c @@ -569,12 +569,9 @@ int ompi_osc_ucx_free(struct ompi_win_t *win) { DBG_OUT("ompi_osc_ucx_free: after mem_flush, mem = %p lock flag = %d\n", (void *)module->mem, (int)module->state.lock); - /* while (module->state.lock != TARGET_LOCK_UNLOCKED) { - // not sure if this is required - // ucp_worker_progress(mca_osc_ucx_component.ucp_worker); + ucp_worker_progress(mca_osc_ucx_component.wpool->recv_worker); } - */ ret = module->comm->c_coll->coll_barrier(module->comm, module->comm->c_coll->coll_barrier_module); From c5df65751d21db899defeba7d356c0c4db9a7f71 Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Mon, 19 Nov 2018 21:51:00 -0800 Subject: [PATCH 57/59] fixes --- ompi/mca/osc/ucx/osc_ucx_component.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ompi/mca/osc/ucx/osc_ucx_component.c b/ompi/mca/osc/ucx/osc_ucx_component.c index a8bd78b78ce..793ebdb763d 100644 --- a/ompi/mca/osc/ucx/osc_ucx_component.c +++ b/ompi/mca/osc/ucx/osc_ucx_component.c @@ -120,7 +120,7 @@ static int component_register(void) { static int progress_callback(void) { if (mca_osc_ucx_component.wpool != NULL) { - opal_common_ucx_workers_progress(mca_osc_ucx_component.wpool); + ucp_worker_progress(mca_osc_ucx_component.wpool->recv_worker); } return 0; } @@ -569,9 +569,11 @@ int ompi_osc_ucx_free(struct ompi_win_t *win) { DBG_OUT("ompi_osc_ucx_free: after mem_flush, mem = %p lock flag = %d\n", (void *)module->mem, (int)module->state.lock); + /* while (module->state.lock != TARGET_LOCK_UNLOCKED) { ucp_worker_progress(mca_osc_ucx_component.wpool->recv_worker); } + */ ret = module->comm->c_coll->coll_barrier(module->comm, module->comm->c_coll->coll_barrier_module); From 989d86f4236dbb4e455ed5a2460d97ab4be3f719 Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Tue, 20 Nov 2018 17:48:58 -0800 Subject: [PATCH 58/59] modify on osc comm code --- ompi/mca/osc/ucx/osc_ucx_comm.c | 326 +++++++++++------------------ ompi/mca/osc/ucx/osc_ucx_request.h | 5 +- opal/mca/common/ucx/common_ucx.c | 37 +++- opal/mca/common/ucx/common_ucx.h | 22 +- 4 files changed, 181 insertions(+), 209 deletions(-) diff --git a/ompi/mca/osc/ucx/osc_ucx_comm.c b/ompi/mca/osc/ucx/osc_ucx_comm.c index 2e57c34a136..b315e281d06 100644 --- a/ompi/mca/osc/ucx/osc_ucx_comm.c +++ b/ompi/mca/osc/ucx/osc_ucx_comm.c @@ -135,7 +135,7 @@ static inline int ddt_put_get(ompi_osc_ucx_module_t *module, const void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt, bool is_origin_contig, ptrdiff_t origin_lb, - int target, ucp_ep_h ep, uint64_t remote_addr, ucp_rkey_h rkey, + int target, uint64_t remote_addr, int target_count, struct ompi_datatype_t *target_dt, bool is_target_contig, ptrdiff_t target_lb, bool is_get) { ucx_iovec_t *origin_ucx_iov = NULL, *target_ucx_iov = NULL; @@ -162,19 +162,20 @@ static inline int ddt_put_get(ompi_osc_ucx_module_t *module, if (!is_origin_contig && !is_target_contig) { size_t curr_len = 0; + opal_common_ucx_op_t op; while (origin_ucx_iov_idx < origin_ucx_iov_count) { curr_len = MIN(origin_ucx_iov[origin_ucx_iov_idx].len, target_ucx_iov[target_ucx_iov_idx].len); - opal_common_ucx_op_t op = OPAL_COMMON_UCX_PUT; if (is_get) { op = OPAL_COMMON_UCX_GET; + } else { + op = OPAL_COMMON_UCX_PUT; } - status = opal_common_ucx_mem_putget(module->mem, op, - target, + status = opal_common_ucx_mem_putget(module->mem, op, target, origin_ucx_iov[origin_ucx_iov_idx].addr, curr_len, remote_addr + (uint64_t)(target_ucx_iov[target_ucx_iov_idx].addr)); if (OPAL_SUCCESS != status) { - OSC_UCX_VERBOSE(1, "ucp_put_nbi failed: %d", status); + OSC_UCX_VERBOSE(1, "opal_common_ucx_mem_putget failed: %d", status); return OMPI_ERROR; } @@ -200,28 +201,24 @@ static inline int ddt_put_get(ompi_osc_ucx_module_t *module, target_ucx_iov_idx == target_ucx_iov_count); } else if (!is_origin_contig) { - /* size_t prev_len = 0; + opal_common_ucx_op_t op; while (origin_ucx_iov_idx < origin_ucx_iov_count) { - if (!is_get) { - status = ucp_put_nbi(ep, origin_ucx_iov[origin_ucx_iov_idx].addr, - origin_ucx_iov[origin_ucx_iov_idx].len, - remote_addr + target_lb + prev_len, rkey); - if (status != UCS_OK && status != UCS_INPROGRESS) { - OSC_UCX_VERBOSE(1, "ucp_put_nbi failed: %d", status); - return OMPI_ERROR; - } + if (is_get) { + op = OPAL_COMMON_UCX_GET; } else { - status = ucp_get_nbi(ep, origin_ucx_iov[origin_ucx_iov_idx].addr, - origin_ucx_iov[origin_ucx_iov_idx].len, - remote_addr + target_lb + prev_len, rkey); - if (status != UCS_OK && status != UCS_INPROGRESS) { - OSC_UCX_VERBOSE(1, "ucp_get_nbi failed: %d", status); - return OMPI_ERROR; - } + op = OPAL_COMMON_UCX_PUT; + } + status = opal_common_ucx_mem_putget(module->mem, op, target, + origin_ucx_iov[origin_ucx_iov_idx].addr, + origin_ucx_iov[origin_ucx_iov_idx].len, + remote_addr + target_lb + prev_len); + if (OPAL_SUCCESS != status) { + OSC_UCX_VERBOSE(1, "opal_common_ucx_mem_putget failed: %d", status); + return OMPI_ERROR; } - ret = incr_and_check_ops_num(module, target, ep); + ret = incr_and_check_ops_num(module, target); if (ret != OMPI_SUCCESS) { return ret; } @@ -229,30 +226,25 @@ static inline int ddt_put_get(ompi_osc_ucx_module_t *module, prev_len += origin_ucx_iov[origin_ucx_iov_idx].len; origin_ucx_iov_idx++; } - */ } else { - /* size_t prev_len = 0; + opal_common_ucx_op_t op; while (target_ucx_iov_idx < target_ucx_iov_count) { - if (!is_get) { - status = ucp_put_nbi(ep, (void *)((intptr_t)origin_addr + origin_lb + prev_len), - target_ucx_iov[target_ucx_iov_idx].len, - remote_addr + (uint64_t)(target_ucx_iov[target_ucx_iov_idx].addr), rkey); - if (status != UCS_OK && status != UCS_INPROGRESS) { - OSC_UCX_VERBOSE(1, "ucp_put_nbi failed: %d", status); - return OMPI_ERROR; - } + if (is_get) { + op = OPAL_COMMON_UCX_GET; } else { - status = ucp_get_nbi(ep, (void *)((intptr_t)origin_addr + origin_lb + prev_len), - target_ucx_iov[target_ucx_iov_idx].len, - remote_addr + (uint64_t)(target_ucx_iov[target_ucx_iov_idx].addr), rkey); - if (status != UCS_OK && status != UCS_INPROGRESS) { - OSC_UCX_VERBOSE(1, "ucp_get_nbi failed: %d", status); - return OMPI_ERROR; - } + op = OPAL_COMMON_UCX_PUT; } - ret = incr_and_check_ops_num(module, target, ep); + status = opal_common_ucx_mem_putget(module->mem, op, target, + (void *)((intptr_t)origin_addr + origin_lb + prev_len), + target_ucx_iov[target_ucx_iov_idx].len, + remote_addr + (uint64_t)(target_ucx_iov[target_ucx_iov_idx].addr)); + if (OPAL_SUCCESS != status) { + OSC_UCX_VERBOSE(1, "opal_common_ucx_mem_putget failed: %d", status); + return OMPI_ERROR; + } + ret = incr_and_check_ops_num(module, target); if (ret != OMPI_SUCCESS) { return ret; } @@ -260,7 +252,6 @@ static inline int ddt_put_get(ompi_osc_ucx_module_t *module, prev_len += target_ucx_iov[target_ucx_iov_idx].len; target_ucx_iov_idx++; } - */ } if (origin_ucx_iov != NULL) { @@ -273,50 +264,42 @@ static inline int ddt_put_get(ompi_osc_ucx_module_t *module, return ret; } -static inline int start_atomicity(ompi_osc_ucx_module_t *module, ucp_ep_h ep, int target) { +static inline int start_atomicity(ompi_osc_ucx_module_t *module, int target) { uint64_t result_value = -1; uint64_t remote_addr = (module->state_addrs)[target] + OSC_UCX_STATE_ACC_LOCK_OFFSET; -// ucs_status_t status; + int ret = OMPI_SUCCESS; while (result_value != TARGET_LOCK_UNLOCKED) { - // TODO: - /*status = */opal_common_ucx_mem_cmpswp(module->state_mem, - TARGET_LOCK_UNLOCKED, TARGET_LOCK_EXCLUSIVE, - target, - &result_value, sizeof(result_value), - remote_addr); -/* - if (status != UCS_OK) { - OSC_UCX_VERBOSE(1, "ucp_atomic_cswap64 failed: %d", status); + ret = opal_common_ucx_mem_cmpswp(module->state_mem, + TARGET_LOCK_UNLOCKED, TARGET_LOCK_EXCLUSIVE, + target, &result_value, sizeof(result_value), + remote_addr); + if (ret != OMPI_SUCCESS) { + OSC_UCX_VERBOSE(1, "opal_common_ucx_mem_cmpswp failed: %d", ret); return OMPI_ERROR; } -*/ } - return OMPI_SUCCESS; + return ret; } -static inline int end_atomicity(ompi_osc_ucx_module_t *module, ucp_ep_h ep, int target) { +static inline int end_atomicity(ompi_osc_ucx_module_t *module, int target) { uint64_t result_value = 0; uint64_t remote_addr = (module->state_addrs)[target] + OSC_UCX_STATE_ACC_LOCK_OFFSET; -// int ret; - - // TODO: - /*status = */opal_common_ucx_mem_fetch(module->state_mem, - UCP_ATOMIC_FETCH_OP_SWAP, TARGET_LOCK_UNLOCKED, - target, - &result_value, sizeof(result_value), - remote_addr); -/* - if (status != UCS_OK) { - OSC_UCX_VERBOSE(1, "ucp_atomic_cswap64 failed: %d", status); + int ret = OMPI_SUCCESS; + + ret = opal_common_ucx_mem_fetch(module->state_mem, + UCP_ATOMIC_FETCH_OP_SWAP, TARGET_LOCK_UNLOCKED, + target, &result_value, sizeof(result_value), + remote_addr); + if (ret != OMPI_SUCCESS) { + OSC_UCX_VERBOSE(1, "opal_common_ucx_mem_fetch failed: %d", ret); return OMPI_ERROR; } -*/ assert(result_value == TARGET_LOCK_EXCLUSIVE); - return OMPI_SUCCESS; + return ret; } static inline int get_dynamic_win_info(uint64_t remote_addr, ompi_osc_ucx_module_t *module, @@ -379,21 +362,20 @@ int ompi_osc_ucx_put(const void *origin_addr, int origin_count, struct ompi_data uint64_t remote_addr = (module->addrs[target]) + target_disp * OSC_UCX_GET_DISP(module, target); bool is_origin_contig = false, is_target_contig = false; ptrdiff_t origin_lb, origin_extent, target_lb, target_extent; - /*ucs_status_t*/ int status; int ret = OMPI_SUCCESS; ret = check_sync_state(module, target, false); if (ret != OMPI_SUCCESS) { return ret; } -/* - if (module->flavor == MPI_WIN_FLAVOR_DYNAMIC) { + +/* if (module->flavor == MPI_WIN_FLAVOR_DYNAMIC) { status = get_dynamic_win_info(remote_addr, module, ep, target); if (status != UCS_OK) { return OMPI_ERROR; } - } -*/ + } */ + if (!target_count) { return OMPI_SUCCESS; } @@ -411,18 +393,17 @@ int ompi_osc_ucx_put(const void *origin_addr, int origin_count, struct ompi_data ompi_datatype_type_size(origin_dt, &origin_len); origin_len *= origin_count; - status = opal_common_ucx_mem_putget(module->mem, OPAL_COMMON_UCX_PUT, - target, - (void *)((intptr_t)origin_addr + origin_lb), - origin_len, remote_addr + target_lb); - if (OPAL_SUCCESS != status) { - OSC_UCX_VERBOSE(1, "ucp_put_nbi failed: %d", status); + ret = opal_common_ucx_mem_putget(module->mem, OPAL_COMMON_UCX_PUT, target, + (void *)((intptr_t)origin_addr + origin_lb), + origin_len, remote_addr + target_lb); + if (OPAL_SUCCESS != ret) { + OSC_UCX_VERBOSE(1, "opal_common_ucx_mem_putget failed: %d", ret); return OMPI_ERROR; } return incr_and_check_ops_num(module, target); } else { return ddt_put_get(module, origin_addr, origin_count, origin_dt, is_origin_contig, - origin_lb, target, NULL, remote_addr, NULL, target_count, target_dt, + origin_lb, target, remote_addr, target_count, target_dt, is_target_contig, target_lb, false); } } @@ -431,14 +412,10 @@ int ompi_osc_ucx_get(void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt, int target, ptrdiff_t target_disp, int target_count, struct ompi_datatype_t *target_dt, struct ompi_win_t *win) { -/* ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module; - ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target); - uint64_t remote_addr = (module->win_info_array[target]).addr + target_disp * OSC_UCX_GET_DISP(module, target); - ucp_rkey_h rkey; + uint64_t remote_addr = (module->addrs[target]) + target_disp * OSC_UCX_GET_DISP(module, target); ptrdiff_t origin_lb, origin_extent, target_lb, target_extent; bool is_origin_contig = false, is_target_contig = false; - ucs_status_t status; int ret = OMPI_SUCCESS; ret = check_sync_state(module, target, false); @@ -446,20 +423,17 @@ int ompi_osc_ucx_get(void *origin_addr, int origin_count, return ret; } - if (module->flavor == MPI_WIN_FLAVOR_DYNAMIC) { +/* if (module->flavor == MPI_WIN_FLAVOR_DYNAMIC) { status = get_dynamic_win_info(remote_addr, module, ep, target); if (status != UCS_OK) { return OMPI_ERROR; } - } - - //CHECK_VALID_RKEY(module, target, target_count); + } */ if (!target_count) { return OMPI_SUCCESS; } -// rkey = (module->win_info_array[target]).rkey; ompi_datatype_get_true_extent(origin_dt, &origin_lb, &origin_extent); ompi_datatype_get_true_extent(target_dt, &target_lb, &target_extent); @@ -468,27 +442,26 @@ int ompi_osc_ucx_get(void *origin_addr, int origin_count, is_target_contig = ompi_datatype_is_contiguous_memory_layout(target_dt, target_count); if (is_origin_contig && is_target_contig) { - // fast path + /* fast path */ size_t origin_len; ompi_datatype_type_size(origin_dt, &origin_len); origin_len *= origin_count; - status = ucp_get_nbi(ep, (void *)((intptr_t)origin_addr + origin_lb), origin_len, - remote_addr + target_lb, rkey); - if (status != UCS_OK && status != UCS_INPROGRESS) { - OSC_UCX_VERBOSE(1, "ucp_get_nbi failed: %d", status); + ret = opal_common_ucx_mem_putget(module->mem, OPAL_COMMON_UCX_GET, target, + (void *)((intptr_t)origin_addr + origin_lb), + origin_len, remote_addr + target_lb); + if (OPAL_SUCCESS != ret) { + OSC_UCX_VERBOSE(1, "opal_common_ucx_mem_putget failed: %d", ret); return OMPI_ERROR; } return incr_and_check_ops_num(module, target); } else { return ddt_put_get(module, origin_addr, origin_count, origin_dt, is_origin_contig, - origin_lb, target, ep, remote_addr, rkey, target_count, target_dt, + origin_lb, target, remote_addr, target_count, target_dt, is_target_contig, target_lb, true); } -*/ - return OMPI_SUCCESS; } int ompi_osc_ucx_accumulate(const void *origin_addr, int origin_count, @@ -496,9 +469,7 @@ int ompi_osc_ucx_accumulate(const void *origin_addr, int origin_count, int target, ptrdiff_t target_disp, int target_count, struct ompi_datatype_t *target_dt, struct ompi_op_t *op, struct ompi_win_t *win) { -/* ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module; - ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target); int ret = OMPI_SUCCESS; ret = check_sync_state(module, target, false); @@ -510,7 +481,7 @@ int ompi_osc_ucx_accumulate(const void *origin_addr, int origin_count, return ret; } - ret = start_atomicity(module, ep, target); + ret = start_atomicity(module, target); if (ret != OMPI_SUCCESS) { return ret; } @@ -550,7 +521,7 @@ int ompi_osc_ucx_accumulate(const void *origin_addr, int origin_count, return ret; } - ret = opal_common_ucx_ep_flush(ep, mca_osc_ucx_component.ucp_worker); + ret = opal_common_ucx_mem_flush(module->mem, OPAL_COMMON_UCX_SCOPE_EP, target); if (ret != OMPI_SUCCESS) { return ret; } @@ -604,7 +575,7 @@ int ompi_osc_ucx_accumulate(const void *origin_addr, int origin_count, return ret; } - ret = opal_common_ucx_ep_flush(ep, mca_osc_ucx_component.ucp_worker); + ret = opal_common_ucx_mem_flush(module->mem, OPAL_COMMON_UCX_SCOPE_EP, target); if (ret != OMPI_SUCCESS) { return ret; } @@ -612,52 +583,41 @@ int ompi_osc_ucx_accumulate(const void *origin_addr, int origin_count, free(temp_addr_holder); } - ret = end_atomicity(module, ep, target); - - return ret; - */ - return OMPI_SUCCESS; + return end_atomicity(module, target); } int ompi_osc_ucx_compare_and_swap(const void *origin_addr, const void *compare_addr, void *result_addr, struct ompi_datatype_t *dt, int target, ptrdiff_t target_disp, struct ompi_win_t *win) { -/* ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t *)win->w_osc_module; - ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target); - uint64_t remote_addr = (module->win_info_array[target]).addr + target_disp * OSC_UCX_GET_DISP(module, target); - ucp_rkey_h rkey; + uint64_t remote_addr = (module->addrs[target]) + target_disp * OSC_UCX_GET_DISP(module, target); size_t dt_bytes; - ompi_osc_ucx_internal_request_t *req = NULL; int ret = OMPI_SUCCESS; - ucs_status_t status; ret = check_sync_state(module, target, false); if (ret != OMPI_SUCCESS) { return ret; } - ret = start_atomicity(module, ep, target); + ret = start_atomicity(module, target); if (ret != OMPI_SUCCESS) { return ret; } - if (module->flavor == MPI_WIN_FLAVOR_DYNAMIC) { +/* if (module->flavor == MPI_WIN_FLAVOR_DYNAMIC) { status = get_dynamic_win_info(remote_addr, module, ep, target); if (status != UCS_OK) { return OMPI_ERROR; } - } - - rkey = (module->win_info_array[target]).rkey; + } */ ompi_datatype_type_size(dt, &dt_bytes); - memcpy(result_addr, origin_addr, dt_bytes); - req = ucp_atomic_fetch_nb(ep, UCP_ATOMIC_FETCH_OP_CSWAP, *(uint64_t *)compare_addr, - result_addr, dt_bytes, remote_addr, rkey, req_completion); - if (UCS_PTR_IS_PTR(req)) { - ucp_request_release(req); + ret = opal_common_ucx_mem_cmpswp(module->mem,*(uint64_t *)compare_addr, + *(uint64_t *)origin_addr, target, + result_addr, dt_bytes, remote_addr); + if (ret != OMPI_SUCCESS) { + return ret; } ret = incr_and_check_ops_num(module, target); @@ -665,17 +625,13 @@ int ompi_osc_ucx_compare_and_swap(const void *origin_addr, const void *compare_a return ret; } - return end_atomicity(module, ep, target); - */ - return OMPI_SUCCESS; + return end_atomicity(module, target); } int ompi_osc_ucx_fetch_and_op(const void *origin_addr, void *result_addr, struct ompi_datatype_t *dt, int target, ptrdiff_t target_disp, struct ompi_op_t *op, struct ompi_win_t *win) { - -/* ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module; int ret = OMPI_SUCCESS; @@ -686,28 +642,22 @@ int ompi_osc_ucx_fetch_and_op(const void *origin_addr, void *result_addr, if (op == &ompi_mpi_op_no_op.op || op == &ompi_mpi_op_replace.op || op == &ompi_mpi_op_sum.op) { - ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target); - uint64_t remote_addr = (module->win_info_array[target]).addr + target_disp * OSC_UCX_GET_DISP(module, target); - ucp_rkey_h rkey; + uint64_t remote_addr = (module->addrs[target]) + target_disp * OSC_UCX_GET_DISP(module, target); uint64_t value = *(uint64_t *)origin_addr; ucp_atomic_fetch_op_t opcode; size_t dt_bytes; - ompi_osc_ucx_internal_request_t *req = NULL; - ucs_status_t status; - ret = start_atomicity(module, ep, target); + ret = start_atomicity(module, target); if (ret != OMPI_SUCCESS) { return ret; } - if (module->flavor == MPI_WIN_FLAVOR_DYNAMIC) { +/* if (module->flavor == MPI_WIN_FLAVOR_DYNAMIC) { status = get_dynamic_win_info(remote_addr, module, ep, target); if (status != UCS_OK) { return OMPI_ERROR; } - } - - rkey = (module->win_info_array[target]).rkey; + } */ ompi_datatype_type_size(dt, &dt_bytes); @@ -720,10 +670,10 @@ int ompi_osc_ucx_fetch_and_op(const void *origin_addr, void *result_addr, } } - req = ucp_atomic_fetch_nb(ep, opcode, value, result_addr, - dt_bytes, remote_addr, rkey, req_completion); - if (UCS_PTR_IS_PTR(req)) { - ucp_request_release(req); + ret = opal_common_ucx_mem_fetch(module->mem, opcode, value, target, + (void *)origin_addr, dt_bytes, remote_addr); + if (ret != OMPI_SUCCESS) { + return ret; } ret = incr_and_check_ops_num(module, target); @@ -731,13 +681,11 @@ int ompi_osc_ucx_fetch_and_op(const void *origin_addr, void *result_addr, return ret; } - return end_atomicity(module, ep, target); + return end_atomicity(module, target); } else { return ompi_osc_ucx_get_accumulate(origin_addr, 1, dt, result_addr, 1, dt, target, target_disp, 1, dt, op, win); } - */ - return OMPI_SUCCESS; } int ompi_osc_ucx_get_accumulate(const void *origin_addr, int origin_count, @@ -747,10 +695,7 @@ int ompi_osc_ucx_get_accumulate(const void *origin_addr, int origin_count, int target, ptrdiff_t target_disp, int target_count, struct ompi_datatype_t *target_dt, struct ompi_op_t *op, struct ompi_win_t *win) { -/* - ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module; - ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target); int ret = OMPI_SUCCESS; ret = check_sync_state(module, target, false); @@ -758,7 +703,7 @@ int ompi_osc_ucx_get_accumulate(const void *origin_addr, int origin_count, return ret; } - ret = start_atomicity(module, ep, target); + ret = start_atomicity(module, target); if (ret != OMPI_SUCCESS) { return ret; } @@ -806,7 +751,7 @@ int ompi_osc_ucx_get_accumulate(const void *origin_addr, int origin_count, return ret; } - ret = opal_common_ucx_ep_flush(ep, mca_osc_ucx_component.ucp_worker); + ret = opal_common_ucx_mem_flush(module->mem, OPAL_COMMON_UCX_SCOPE_EP, target); if (ret != OMPI_SUCCESS) { return ret; } @@ -859,7 +804,7 @@ int ompi_osc_ucx_get_accumulate(const void *origin_addr, int origin_count, return ret; } - ret = opal_common_ucx_ep_flush(ep, mca_osc_ucx_component.ucp_worker); + ret = opal_common_ucx_mem_flush(module->mem, OPAL_COMMON_UCX_SCOPE_EP, target); if (ret != OMPI_SUCCESS) { return ret; } @@ -868,11 +813,7 @@ int ompi_osc_ucx_get_accumulate(const void *origin_addr, int origin_count, } } - ret = end_atomicity(module, ep, target); - - return ret; - */ - return OMPI_SUCCESS; + return end_atomicity(module, target); } int ompi_osc_ucx_rput(const void *origin_addr, int origin_count, @@ -880,15 +821,10 @@ int ompi_osc_ucx_rput(const void *origin_addr, int origin_count, int target, ptrdiff_t target_disp, int target_count, struct ompi_datatype_t *target_dt, struct ompi_win_t *win, struct ompi_request_t **request) { - - /* ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module; - ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target); - uint64_t remote_addr = (module->state_info_array[target]).addr + OSC_UCX_STATE_REQ_FLAG_OFFSET; - ucp_rkey_h rkey; + uint64_t remote_addr = (module->addrs[target]) + target_disp * OSC_UCX_GET_DISP(module, target); ompi_osc_ucx_request_t *ucx_req = NULL; ompi_osc_ucx_internal_request_t *internal_req = NULL; - ucs_status_t status; int ret = OMPI_SUCCESS; ret = check_sync_state(module, target, true); @@ -896,16 +832,12 @@ int ompi_osc_ucx_rput(const void *origin_addr, int origin_count, return ret; } - if (module->flavor == MPI_WIN_FLAVOR_DYNAMIC) { +/* if (module->flavor == MPI_WIN_FLAVOR_DYNAMIC) { status = get_dynamic_win_info(remote_addr, module, ep, target); if (status != UCS_OK) { return OMPI_ERROR; } - } - - CHECK_VALID_RKEY(module, target, target_count); - - rkey = (module->win_info_array[target]).rkey; + } */ OMPI_OSC_UCX_REQUEST_ALLOC(win, ucx_req); assert(NULL != ucx_req); @@ -916,15 +848,19 @@ int ompi_osc_ucx_rput(const void *origin_addr, int origin_count, return ret; } - status = ucp_worker_fence(mca_osc_ucx_component.ucp_worker); - if (status != UCS_OK) { - OSC_UCX_VERBOSE(1, "ucp_worker_fence failed: %d", status); + ret = opal_common_ucx_mem_fence(module->mem); + if (ret != OMPI_SUCCESS) { + OSC_UCX_VERBOSE(1, "opal_common_ucx_mem_fence failed: %d", ret); return OMPI_ERROR; } - internal_req = ucp_atomic_fetch_nb(ep, UCP_ATOMIC_FETCH_OP_FADD, 0, - &(module->req_result), sizeof(uint64_t), - remote_addr, rkey, req_completion); + ret = opal_common_ucx_mem_fetch_nb(module->mem, UCP_ATOMIC_FETCH_OP_FADD, + 0, target, &(module->req_result), + sizeof(uint64_t), remote_addr, + (ucs_status_ptr_t *)&internal_req); + if (ret != OMPI_SUCCESS) { + return ret; + } if (UCS_PTR_IS_PTR(internal_req)) { internal_req->external_req = ucx_req; @@ -936,8 +872,6 @@ int ompi_osc_ucx_rput(const void *origin_addr, int origin_count, *request = &ucx_req->super; return incr_and_check_ops_num(module, target); - */ - return OMPI_SUCCESS; } int ompi_osc_ucx_rget(void *origin_addr, int origin_count, @@ -945,14 +879,10 @@ int ompi_osc_ucx_rget(void *origin_addr, int origin_count, int target, ptrdiff_t target_disp, int target_count, struct ompi_datatype_t *target_dt, struct ompi_win_t *win, struct ompi_request_t **request) { -/* ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module; - ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target); - uint64_t remote_addr = (module->state_info_array[target]).addr + OSC_UCX_STATE_REQ_FLAG_OFFSET; - ucp_rkey_h rkey; + uint64_t remote_addr = (module->addrs[target]) + target_disp * OSC_UCX_GET_DISP(module, target); ompi_osc_ucx_request_t *ucx_req = NULL; ompi_osc_ucx_internal_request_t *internal_req = NULL; - ucs_status_t status; int ret = OMPI_SUCCESS; ret = check_sync_state(module, target, true); @@ -960,16 +890,12 @@ int ompi_osc_ucx_rget(void *origin_addr, int origin_count, return ret; } - if (module->flavor == MPI_WIN_FLAVOR_DYNAMIC) { +/* if (module->flavor == MPI_WIN_FLAVOR_DYNAMIC) { status = get_dynamic_win_info(remote_addr, module, ep, target); if (status != UCS_OK) { return OMPI_ERROR; } - } - - CHECK_VALID_RKEY(module, target, target_count); - - rkey = (module->win_info_array[target]).rkey; + } */ OMPI_OSC_UCX_REQUEST_ALLOC(win, ucx_req); assert(NULL != ucx_req); @@ -980,15 +906,19 @@ int ompi_osc_ucx_rget(void *origin_addr, int origin_count, return ret; } - status = ucp_worker_fence(mca_osc_ucx_component.ucp_worker); - if (status != UCS_OK) { - OSC_UCX_VERBOSE(1, "ucp_worker_fence failed: %d", status); + ret = opal_common_ucx_mem_fence(module->mem); + if (ret != OMPI_SUCCESS) { + OSC_UCX_VERBOSE(1, "opal_common_ucx_mem_fence failed: %d", ret); return OMPI_ERROR; } - internal_req = ucp_atomic_fetch_nb(ep, UCP_ATOMIC_FETCH_OP_FADD, 0, - &(module->req_result), sizeof(uint64_t), - remote_addr, rkey, req_completion); + ret = opal_common_ucx_mem_fetch_nb(module->mem, UCP_ATOMIC_FETCH_OP_FADD, + 0, target, &(module->req_result), + sizeof(uint64_t), remote_addr, + (ucs_status_ptr_t *)&internal_req); + if (ret != OMPI_SUCCESS) { + return ret; + } if (UCS_PTR_IS_PTR(internal_req)) { internal_req->external_req = ucx_req; @@ -1000,8 +930,6 @@ int ompi_osc_ucx_rget(void *origin_addr, int origin_count, *request = &ucx_req->super; return incr_and_check_ops_num(module, target); - */ - return OMPI_SUCCESS; } int ompi_osc_ucx_raccumulate(const void *origin_addr, int origin_count, @@ -1009,7 +937,6 @@ int ompi_osc_ucx_raccumulate(const void *origin_addr, int origin_count, int target, ptrdiff_t target_disp, int target_count, struct ompi_datatype_t *target_dt, struct ompi_op_t *op, struct ompi_win_t *win, struct ompi_request_t **request) { -/* ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module; ompi_osc_ucx_request_t *ucx_req = NULL; int ret = OMPI_SUCCESS; @@ -1032,8 +959,6 @@ int ompi_osc_ucx_raccumulate(const void *origin_addr, int origin_count, *request = &ucx_req->super; return ret; - */ - return OMPI_SUCCESS; } int ompi_osc_ucx_rget_accumulate(const void *origin_addr, int origin_count, @@ -1044,7 +969,6 @@ int ompi_osc_ucx_rget_accumulate(const void *origin_addr, int origin_count, struct ompi_datatype_t *target_datatype, struct ompi_op_t *op, struct ompi_win_t *win, struct ompi_request_t **request) { -/* ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module; ompi_osc_ucx_request_t *ucx_req = NULL; int ret = OMPI_SUCCESS; @@ -1070,6 +994,4 @@ int ompi_osc_ucx_rget_accumulate(const void *origin_addr, int origin_count, *request = &ucx_req->super; return ret; - */ - return OMPI_SUCCESS; } diff --git a/ompi/mca/osc/ucx/osc_ucx_request.h b/ompi/mca/osc/ucx/osc_ucx_request.h index b33bc54c2de..86934ae30eb 100644 --- a/ompi/mca/osc/ucx/osc_ucx_request.h +++ b/ompi/mca/osc/ucx/osc_ucx_request.h @@ -32,9 +32,8 @@ typedef struct ompi_osc_ucx_internal_request { do { \ item = opal_free_list_get(&mca_osc_ucx_component.requests); \ if (item == NULL) { \ - if (mca_osc_ucx_component.ucp_worker != NULL && \ - mca_osc_ucx_component.num_incomplete_req_ops > 0) { \ - ucp_worker_progress(mca_osc_ucx_component.ucp_worker); \ + if (mca_osc_ucx_component.num_incomplete_req_ops > 0) { \ + opal_common_ucx_workers_progress(mca_osc_ucx_component.wpool); \ } \ } \ } while (item == NULL); \ diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index 93dd22c0d1e..b942b259ad5 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -1474,7 +1474,7 @@ int opal_common_ucx_mem_fetch(opal_common_ucx_mem_t *mem, rem_addr, rkey, winfo->worker); if (status != UCS_OK) { - MCA_COMMON_UCX_VERBOSE(1, "ucp_atomic_cswap64 failed: %d", status); + MCA_COMMON_UCX_VERBOSE(1, "opal_common_ucx_atomic_fetch failed: %d", status); rc = OPAL_ERROR; } else { DBG_OUT("opal_common_ucx_mem_fetch(after opal_common_ucx_atomic_fetch): ep = %p, rkey = %p\n", @@ -1485,6 +1485,35 @@ int opal_common_ucx_mem_fetch(opal_common_ucx_mem_t *mem, return rc; } +OPAL_DECLSPEC +int opal_common_ucx_mem_fetch_nb(opal_common_ucx_mem_t *mem, + ucp_atomic_fetch_op_t opcode, + uint64_t value, + int target, void *buffer, size_t len, + uint64_t rem_addr, ucs_status_ptr_t *ptr) +{ + ucp_ep_h ep = NULL; + ucp_rkey_h rkey = NULL; + _worker_info_t *winfo = NULL; + int rc = OPAL_SUCCESS; + + rc =_tlocal_fetch(mem, target, &ep, &rkey, &winfo); + if(OPAL_SUCCESS != rc){ + MCA_COMMON_UCX_VERBOSE(1, "tlocal_fetch failed: %d", rc); + return rc; + } + + /* Perform the operation */ + opal_mutex_lock(&winfo->mutex); + (*ptr) = opal_common_ucx_atomic_fetch_nb(ep, opcode, value, + buffer, len, + rem_addr, rkey, + winfo->worker); + opal_mutex_unlock(&winfo->mutex); + + return rc; +} + OPAL_DECLSPEC int opal_common_ucx_mem_post(opal_common_ucx_mem_t *mem, @@ -1580,3 +1609,9 @@ int opal_common_ucx_workers_progress(opal_common_ucx_wpool_t *wpool) { return OPAL_SUCCESS; } + +OPAL_DECLSPEC int +opal_common_ucx_mem_fence(opal_common_ucx_mem_t *mem) { + /* TODO */ + return OPAL_SUCCESS; +} diff --git a/opal/mca/common/ucx/common_ucx.h b/opal/mca/common/ucx/common_ucx.h index 89966f08273..e264e315fca 100644 --- a/opal/mca/common/ucx/common_ucx.h +++ b/opal/mca/common/ucx/common_ucx.h @@ -105,7 +105,7 @@ typedef struct { ucp_worker_h recv_worker; ucp_address_t *recv_waddr; size_t recv_waddr_len; - int cur_ctxid, cur_memid; + opal_atomic_int32_t cur_ctxid, cur_memid; opal_list_t tls_list; } opal_common_ucx_wpool_t; @@ -169,6 +169,12 @@ OPAL_DECLSPEC int opal_common_ucx_mem_create(opal_common_ucx_ctx_t *ctx, int com OPAL_DECLSPEC int opal_common_ucx_mem_flush(opal_common_ucx_mem_t *mem, opal_common_ucx_flush_scope_t scope, int target); +OPAL_DECLSPEC int opal_common_ucx_mem_fetch_nb(opal_common_ucx_mem_t *mem, + ucp_atomic_fetch_op_t opcode, + uint64_t value, + int target, void *buffer, size_t len, + uint64_t rem_addr, ucs_status_ptr_t *ptr); +OPAL_DECLSPEC int opal_common_ucx_mem_fence(opal_common_ucx_mem_t *mem); OPAL_DECLSPEC int opal_common_ucx_workers_progress(opal_common_ucx_wpool_t *wpool); OPAL_DECLSPEC int opal_common_ucx_mem_cmpswp(opal_common_ucx_mem_t *mem, uint64_t compare, uint64_t value, @@ -304,6 +310,16 @@ int opal_common_ucx_worker_flush(ucp_worker_h worker) #endif } +static inline +ucs_status_ptr_t opal_common_ucx_atomic_fetch_nb(ucp_ep_h ep, ucp_atomic_fetch_op_t opcode, + uint64_t value, void *result, size_t op_size, + uint64_t remote_addr, ucp_rkey_h rkey, + ucp_worker_h worker) +{ + return ucp_atomic_fetch_nb(ep, opcode, value, result, op_size, + remote_addr, rkey, opal_common_ucx_empty_complete_cb); +} + static inline int opal_common_ucx_atomic_fetch(ucp_ep_h ep, ucp_atomic_fetch_op_t opcode, uint64_t value, void *result, size_t op_size, @@ -312,8 +328,8 @@ int opal_common_ucx_atomic_fetch(ucp_ep_h ep, ucp_atomic_fetch_op_t opcode, { ucs_status_ptr_t request; - request = ucp_atomic_fetch_nb(ep, opcode, value, result, op_size, - remote_addr, rkey, opal_common_ucx_empty_complete_cb); + request = opal_common_ucx_atomic_fetch_nb(ep, opcode, value, result, op_size, + remote_addr, rkey, worker); return opal_common_ucx_wait_request(request, worker, "ucp_atomic_fetch_nb"); } From e553e248a3e3570e480bb40ce71f6e143a529479 Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Tue, 20 Nov 2018 18:10:31 -0800 Subject: [PATCH 59/59] fixes on osc code --- ompi/mca/osc/ucx/osc_ucx_active_target.c | 141 +++++++++--------- ompi/mca/osc/ucx/osc_ucx_passive_target.c | 168 ++++++++-------------- 2 files changed, 127 insertions(+), 182 deletions(-) diff --git a/ompi/mca/osc/ucx/osc_ucx_active_target.c b/ompi/mca/osc/ucx/osc_ucx_active_target.c index c0271dfbfbe..be69d209776 100644 --- a/ompi/mca/osc/ucx/osc_ucx_active_target.c +++ b/ompi/mca/osc/ucx/osc_ucx_active_target.c @@ -59,9 +59,8 @@ static inline void ompi_osc_ucx_handle_incoming_post(ompi_osc_ucx_module_t *modu } int ompi_osc_ucx_fence(int assert, struct ompi_win_t *win) { - /* ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module; - int ret; + int ret = OMPI_SUCCESS; if (module->epoch_type.access != NONE_EPOCH && module->epoch_type.access != FENCE_EPOCH) { @@ -75,7 +74,7 @@ int ompi_osc_ucx_fence(int assert, struct ompi_win_t *win) { } if (!(assert & MPI_MODE_NOPRECEDE)) { - ret = opal_common_ucx_worker_flush(mca_osc_ucx_component.ucp_worker); + ret = opal_common_ucx_mem_flush(module->mem, OPAL_COMMON_UCX_SCOPE_WORKER, 0/*ignore*/); if (ret != OMPI_SUCCESS) { return ret; } @@ -87,13 +86,9 @@ int ompi_osc_ucx_fence(int assert, struct ompi_win_t *win) { return module->comm->c_coll->coll_barrier(module->comm, module->comm->c_coll->coll_barrier_module); - */ - return OMPI_SUCCESS; } int ompi_osc_ucx_start(struct ompi_group_t *group, int assert, struct ompi_win_t *win) { - - /* ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module; int i, size, *ranks_in_grp = NULL, *ranks_in_win_grp = NULL; ompi_group_t *win_group = NULL; @@ -131,7 +126,7 @@ int ompi_osc_ucx_start(struct ompi_group_t *group, int assert, struct ompi_win_t if ((assert & MPI_MODE_NOCHECK) == 0) { ompi_osc_ucx_pending_post_t *pending_post, *next; - // first look through the pending list + /* first look through the pending list */ OPAL_LIST_FOREACH_SAFE(pending_post, next, &module->pending_posts, ompi_osc_ucx_pending_post_t) { for (i = 0; i < size; i++) { if (pending_post->rank == ranks_in_win_grp[i]) { @@ -143,7 +138,7 @@ int ompi_osc_ucx_start(struct ompi_group_t *group, int assert, struct ompi_win_t } } - // waiting for the rest post requests to come + /* waiting for the rest post requests to come */ while (module->post_count != size) { for (i = 0; i < OMPI_OSC_UCX_POST_PEER_MAX; i++) { if (0 == module->state.post_state[i]) { @@ -152,7 +147,7 @@ int ompi_osc_ucx_start(struct ompi_group_t *group, int assert, struct ompi_win_t ompi_osc_ucx_handle_incoming_post(module, &(module->state.post_state[i]), ranks_in_win_grp, size); } - ucp_worker_progress(mca_osc_ucx_component.ucp_worker); + opal_common_ucx_workers_progress(mca_osc_ucx_component.wpool); } module->post_count = 0; @@ -164,14 +159,10 @@ int ompi_osc_ucx_start(struct ompi_group_t *group, int assert, struct ompi_win_t module->start_grp_ranks = ranks_in_win_grp; return ret; - */ - return OMPI_SUCCESS; } int ompi_osc_ucx_complete(struct ompi_win_t *win) { - /* ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module; - ucs_status_t status; int i, size; int ret = OMPI_SUCCESS; @@ -181,29 +172,30 @@ int ompi_osc_ucx_complete(struct ompi_win_t *win) { module->epoch_type.access = NONE_EPOCH; - ret = opal_common_ucx_worker_flush(mca_osc_ucx_component.ucp_worker); + ret = opal_common_ucx_mem_flush(module->mem, OPAL_COMMON_UCX_SCOPE_WORKER, 0/*ignore*/); if (ret != OMPI_SUCCESS) { return ret; } + module->global_ops_num = 0; memset(module->per_target_ops_nums, 0, sizeof(int) * ompi_comm_size(module->comm)); size = ompi_group_size(module->start_group); for (i = 0; i < size; i++) { - uint64_t remote_addr = (module->state_info_array)[module->start_grp_ranks[i]].addr + OSC_UCX_STATE_COMPLETE_COUNT_OFFSET; // write to state.complete_count on remote side - ucp_rkey_h rkey = (module->state_info_array)[module->start_grp_ranks[i]].rkey; - ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, module->start_grp_ranks[i]); - - status = ucp_atomic_post(ep, UCP_ATOMIC_POST_OP_ADD, 1, - sizeof(uint64_t), remote_addr, rkey); - if (status != UCS_OK) { - OSC_UCX_VERBOSE(1, "ucp_atomic_post failed: %d", status); + uint64_t remote_addr = module->state_addrs[module->start_grp_ranks[i]] + OSC_UCX_STATE_COMPLETE_COUNT_OFFSET; // write to state.complete_count on remote side + + ret = opal_common_ucx_mem_post(module->mem, UCP_ATOMIC_POST_OP_ADD, + 1, module->start_grp_ranks[i], sizeof(uint64_t), + remote_addr); + if (ret != OMPI_SUCCESS) { + OSC_UCX_VERBOSE(1, "opal_common_ucx_mem_post failed: %d", ret); } - ret = opal_common_ucx_ep_flush(ep, mca_osc_ucx_component.ucp_worker); - if (OMPI_SUCCESS != ret) { - OSC_UCX_VERBOSE(1, "opal_common_ucx_ep_flush failed: %d", ret); + ret = opal_common_ucx_mem_flush(module->mem, OPAL_COMMON_UCX_SCOPE_EP, + module->start_grp_ranks[i]); + if (ret != OMPI_SUCCESS) { + return ret; } } @@ -212,12 +204,9 @@ int ompi_osc_ucx_complete(struct ompi_win_t *win) { free(module->start_grp_ranks); return ret; - */ - return OMPI_SUCCESS; } int ompi_osc_ucx_post(struct ompi_group_t *group, int assert, struct ompi_win_t *win) { - /* ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module; int ret = OMPI_SUCCESS; @@ -254,32 +243,36 @@ int ompi_osc_ucx_post(struct ompi_group_t *group, int assert, struct ompi_win_t } for (i = 0; i < size; i++) { - uint64_t remote_addr = (module->state_info_array)[ranks_in_win_grp[i]].addr + OSC_UCX_STATE_POST_INDEX_OFFSET; // write to state.post_index on remote side - ucp_rkey_h rkey = (module->state_info_array)[ranks_in_win_grp[i]].rkey; - ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, ranks_in_win_grp[i]); + uint64_t remote_addr = module->state_addrs[ranks_in_win_grp[i]] + OSC_UCX_STATE_POST_INDEX_OFFSET; // write to state.post_index on remote side uint64_t curr_idx = 0, result = 0; - // do fop first to get an post index - opal_common_ucx_atomic_fetch(ep, UCP_ATOMIC_FETCH_OP_FADD, 1, - &result, sizeof(result), - remote_addr, rkey, mca_osc_ucx_component.ucp_worker); + /* do fop first to get an post index */ + ret = opal_common_ucx_mem_fetch(module->mem, UCP_ATOMIC_FETCH_OP_FADD, + 1, ranks_in_win_grp[i], &result, + sizeof(result), remote_addr); + if (ret != OMPI_SUCCESS) { + return OMPI_ERROR; + } curr_idx = result & (OMPI_OSC_UCX_POST_PEER_MAX - 1); - remote_addr = (module->state_info_array)[ranks_in_win_grp[i]].addr + OSC_UCX_STATE_POST_STATE_OFFSET + sizeof(uint64_t) * curr_idx; + remote_addr = module->state_addrs[ranks_in_win_grp[i]] + OSC_UCX_STATE_POST_STATE_OFFSET + sizeof(uint64_t) * curr_idx; - // do cas to send post message + /* do cas to send post message */ do { - opal_common_ucx_atomic_cswap(ep, 0, (uint64_t)myrank + 1, &result, - sizeof(result), remote_addr, rkey, - mca_osc_ucx_component.ucp_worker); + ret = opal_common_ucx_mem_cmpswp(module->mem, 0, result, + myrank + 1, &result, sizeof(result), + remote_addr); + if (ret != OMPI_SUCCESS) { + return OMPI_ERROR; + } if (result == 0) break; - // prevent circular wait by checking for post messages received + /* prevent circular wait by checking for post messages received */ for (j = 0; j < OMPI_OSC_UCX_POST_PEER_MAX; j++) { - // no post at this index (yet) + /* no post at this index (yet) */ if (0 == module->state.post_state[j]) { continue; } @@ -299,58 +292,56 @@ int ompi_osc_ucx_post(struct ompi_group_t *group, int assert, struct ompi_win_t module->epoch_type.exposure = POST_WAIT_EPOCH; return ret; - */ - return OMPI_SUCCESS; } int ompi_osc_ucx_wait(struct ompi_win_t *win) { -// ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module; -// int size; + ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module; + int size; -// if (module->epoch_type.exposure != POST_WAIT_EPOCH) { -// return OMPI_ERR_RMA_SYNC; -// } + if (module->epoch_type.exposure != POST_WAIT_EPOCH) { + return OMPI_ERR_RMA_SYNC; + } -// size = ompi_group_size(module->post_group); + size = ompi_group_size(module->post_group); -// while (module->state.complete_count != (uint64_t)size) { -// /* not sure if this is required */ -// ucp_worker_progress(mca_osc_ucx_component.ucp_worker); -// } + while (module->state.complete_count != (uint64_t)size) { + /* not sure if this is required */ + opal_common_ucx_workers_progress(mca_osc_ucx_component.wpool); + } -// module->state.complete_count = 0; + module->state.complete_count = 0; -// OBJ_RELEASE(module->post_group); -// module->post_group = NULL; + OBJ_RELEASE(module->post_group); + module->post_group = NULL; -// module->epoch_type.exposure = NONE_EPOCH; + module->epoch_type.exposure = NONE_EPOCH; return OMPI_SUCCESS; } int ompi_osc_ucx_test(struct ompi_win_t *win, int *flag) { -// ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module; -// int size; + ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module; + int size; -// if (module->epoch_type.exposure != POST_WAIT_EPOCH) { -// return OMPI_ERR_RMA_SYNC; -// } + if (module->epoch_type.exposure != POST_WAIT_EPOCH) { + return OMPI_ERR_RMA_SYNC; + } -// size = ompi_group_size(module->post_group); + size = ompi_group_size(module->post_group); -// opal_progress(); + opal_progress(); -// if (module->state.complete_count == (uint64_t)size) { -// OBJ_RELEASE(module->post_group); -// module->post_group = NULL; + if (module->state.complete_count == (uint64_t)size) { + OBJ_RELEASE(module->post_group); + module->post_group = NULL; -// module->state.complete_count = 0; + module->state.complete_count = 0; -// module->epoch_type.exposure = NONE_EPOCH; -// *flag = 1; -// } else { -// *flag = 0; -// } + module->epoch_type.exposure = NONE_EPOCH; + *flag = 1; + } else { + *flag = 0; + } return OMPI_SUCCESS; } diff --git a/ompi/mca/osc/ucx/osc_ucx_passive_target.c b/ompi/mca/osc/ucx/osc_ucx_passive_target.c index 33155204101..2c658a38cd4 100644 --- a/ompi/mca/osc/ucx/osc_ucx_passive_target.c +++ b/ompi/mca/osc/ucx/osc_ucx_passive_target.c @@ -21,109 +21,74 @@ OBJ_CLASS_INSTANCE(ompi_osc_ucx_lock_t, opal_object_t, NULL, NULL); static inline int start_shared(ompi_osc_ucx_module_t *module, int target) { uint64_t result_value = -1; uint64_t remote_addr = (module->state_addrs)[target] + OSC_UCX_STATE_LOCK_OFFSET; -// ucs_status_t status; -// int ret; + int ret = OMPI_SUCCESS; while (true) { - opal_common_ucx_mem_fetch(module->state_mem, UCP_ATOMIC_FETCH_OP_FADD, 1, - target, - &result_value, sizeof(result_value), - remote_addr); - -// if (OMPI_SUCCESS != ret) { -// return ret; -// } + ret = opal_common_ucx_mem_fetch(module->state_mem, UCP_ATOMIC_FETCH_OP_FADD, 1, + target, &result_value, sizeof(result_value), + remote_addr); + if (OMPI_SUCCESS != ret) { + return ret; + } DBG_OUT("start_shared: after fadd, result_value = %d", (int)result_value); assert((int64_t)result_value >= 0); if (result_value >= TARGET_LOCK_EXCLUSIVE) { -// status = ucp_atomic_post(ep, UCP_ATOMIC_POST_OP_ADD, (-1), sizeof(uint64_t), -// remote_addr, rkey); - opal_common_ucx_mem_post(module->state_mem, - UCP_ATOMIC_POST_OP_ADD, (-1), target, - sizeof(uint64_t), - remote_addr); -// if (status != UCS_OK) { -// OSC_UCX_VERBOSE(1, "ucp_atomic_add64 failed: %d", status); -// return OMPI_ERROR; -// } + ret = opal_common_ucx_mem_post(module->state_mem, + UCP_ATOMIC_POST_OP_ADD, (-1), target, + sizeof(uint64_t), remote_addr); + if (OMPI_SUCCESS != ret) { + return ret; + } } else { break; } } - return OMPI_SUCCESS; + return ret; } static inline int end_shared(ompi_osc_ucx_module_t *module, int target) { -// ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target); -// ucp_rkey_h rkey = (module->state_info_array)[target].rkey; uint64_t remote_addr = (module->state_addrs)[target] + OSC_UCX_STATE_LOCK_OFFSET; -// ucs_status_t status; - -// status = ucp_atomic_post(ep, UCP_ATOMIC_POST_OP_ADD, (-1), sizeof(uint64_t), -// remote_addr, rkey); - opal_common_ucx_mem_post(module->state_mem, - UCP_ATOMIC_POST_OP_ADD, (-1), target, sizeof(uint64_t), - remote_addr); -// if (status != UCS_OK) { -// OSC_UCX_VERBOSE(1, "ucp_atomic_post(OP_ADD) failed: %d", status); -// return OMPI_ERROR; -// } - - return OMPI_SUCCESS; + return opal_common_ucx_mem_post(module->state_mem, UCP_ATOMIC_POST_OP_ADD, + (-1), target, sizeof(uint64_t), remote_addr); } static inline int start_exclusive(ompi_osc_ucx_module_t *module, int target) { uint64_t result_value = -1; -// ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target); -// ucp_rkey_h rkey = (module->state_info_array)[target].rkey; uint64_t remote_addr = (module->state_addrs)[target] + OSC_UCX_STATE_LOCK_OFFSET; -// ucs_status_t status; + int ret = OMPI_SUCCESS; while (result_value != TARGET_LOCK_UNLOCKED) { -// status = opal_common_ucx_atomic_cswap(ep, TARGET_LOCK_UNLOCKED, TARGET_LOCK_EXCLUSIVE, -// &result_value, sizeof(result_value), -// remote_addr, rkey, -// mca_osc_ucx_component.ucp_worker); - opal_common_ucx_mem_cmpswp(module->state_mem, - TARGET_LOCK_UNLOCKED, TARGET_LOCK_EXCLUSIVE, - target, - &result_value, sizeof(result_value), - remote_addr); - -// if (status != UCS_OK) { -// return OMPI_ERROR; -// } + ret = opal_common_ucx_mem_cmpswp(module->state_mem, + TARGET_LOCK_UNLOCKED, TARGET_LOCK_EXCLUSIVE, + target, &result_value, sizeof(result_value), + remote_addr); + if (OMPI_SUCCESS != ret) { + return ret; + } } - return OMPI_SUCCESS; + return ret; } static inline int end_exclusive(ompi_osc_ucx_module_t *module, int target) { uint64_t result_value = 0; -// ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, target); -// ucp_rkey_h rkey = (module->state_info_array)[target].rkey; uint64_t remote_addr = (module->state_addrs)[target] + OSC_UCX_STATE_LOCK_OFFSET; -// int ret; - -// ret = opal_common_ucx_atomic_fetch(ep, UCP_ATOMIC_FETCH_OP_SWAP, TARGET_LOCK_UNLOCKED, -// &result_value, sizeof(result_value), -// remote_addr, rkey, mca_osc_ucx_component.ucp_worker); + int ret = OMPI_SUCCESS; - opal_common_ucx_mem_fetch(module->state_mem, - UCP_ATOMIC_FETCH_OP_SWAP, TARGET_LOCK_UNLOCKED, - target, - &result_value, sizeof(result_value), - remote_addr); -// if (OMPI_SUCCESS != ret) { -// return ret; -// } + ret = opal_common_ucx_mem_fetch(module->state_mem, + UCP_ATOMIC_FETCH_OP_SWAP, TARGET_LOCK_UNLOCKED, + target, &result_value, sizeof(result_value), + remote_addr); + if (OMPI_SUCCESS != ret) { + return ret; + } assert(result_value >= TARGET_LOCK_EXCLUSIVE); - return OMPI_SUCCESS; + return ret; } int ompi_osc_ucx_lock(int lock_type, int target, int assert, struct ompi_win_t *win) { @@ -180,7 +145,6 @@ int ompi_osc_ucx_unlock(int target, struct ompi_win_t *win) { ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t *)win->w_osc_module; ompi_osc_ucx_lock_t *lock = NULL; int ret = OMPI_SUCCESS; -// ucp_ep_h ep; if (module->epoch_type.access != PASSIVE_EPOCH) { return OMPI_ERR_RMA_SYNC; @@ -194,9 +158,7 @@ int ompi_osc_ucx_unlock(int target, struct ompi_win_t *win) { opal_hash_table_remove_value_uint32(&module->outstanding_locks, (uint32_t)target); -// ep = OSC_UCX_GET_EP(module->comm, target); -// ret = opal_common_ucx_ep_flush(ep, mca_osc_ucx_component.ucp_worker); - opal_common_ucx_mem_flush(module->mem, OPAL_COMMON_UCX_SCOPE_EP, target); + ret = opal_common_ucx_mem_flush(module->mem, OPAL_COMMON_UCX_SCOPE_EP, target); if (ret != OMPI_SUCCESS) { return ret; } @@ -269,11 +231,10 @@ int ompi_osc_ucx_unlock_all(struct ompi_win_t *win) { assert(module->lock_count == 0); - //ret = opal_common_ucx_worker_flush(mca_osc_ucx_component.ucp_worker); - opal_common_ucx_mem_flush(module->mem, OPAL_COMMON_UCX_SCOPE_WORKER, 0); -// if (ret != OMPI_SUCCESS) { -// return ret; -// } + ret = opal_common_ucx_mem_flush(module->mem, OPAL_COMMON_UCX_SCOPE_WORKER, 0); + if (ret != OMPI_SUCCESS) { + return ret; + } DBG_OUT("ompi_osc_ucx_unlock_all: after flush, mem = %p\n", (void *)module->mem); @@ -295,42 +256,37 @@ int ompi_osc_ucx_unlock_all(struct ompi_win_t *win) { } int ompi_osc_ucx_sync(struct ompi_win_t *win) { + ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t *)win->w_osc_module; + int ret = OMPI_SUCCESS; -// ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t *)win->w_osc_module; -// ucs_status_t status; - -// if (module->epoch_type.access != PASSIVE_EPOCH && -// module->epoch_type.access != PASSIVE_ALL_EPOCH) { -// return OMPI_ERR_RMA_SYNC; -// } + if (module->epoch_type.access != PASSIVE_EPOCH && + module->epoch_type.access != PASSIVE_ALL_EPOCH) { + return OMPI_ERR_RMA_SYNC; + } -// opal_atomic_mb(); + opal_atomic_mb(); -// status = ucp_worker_fence(mca_osc_ucx_component.ucp_worker); -// if (status != UCS_OK) { -// OSC_UCX_VERBOSE(1, "ucp_worker_fence failed: %d", status); -// return OMPI_ERROR; -// } + ret = opal_common_ucx_mem_fence(module->mem); + if (ret != OMPI_SUCCESS) { + OSC_UCX_VERBOSE(1, "opal_common_ucx_mem_fence failed: %d", ret); + } - return OMPI_SUCCESS; + return ret; } int ompi_osc_ucx_flush(int target, struct ompi_win_t *win) { ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t*) win->w_osc_module; -// ucp_ep_h ep; -// int ret; + int ret = OMPI_SUCCESS; if (module->epoch_type.access != PASSIVE_EPOCH && module->epoch_type.access != PASSIVE_ALL_EPOCH) { return OMPI_ERR_RMA_SYNC; } -// ep = OSC_UCX_GET_EP(module->comm, target); -// ret = opal_common_ucx_ep_flush(ep, mca_osc_ucx_component.ucp_worker); - opal_common_ucx_mem_flush(module->mem, OPAL_COMMON_UCX_SCOPE_EP, target); -// if (ret != OMPI_SUCCESS) { -// return ret; -// } + ret = opal_common_ucx_mem_flush(module->mem, OPAL_COMMON_UCX_SCOPE_EP, target); + if (ret != OMPI_SUCCESS) { + return ret; + } module->global_ops_num -= module->per_target_ops_nums[target]; module->per_target_ops_nums[target] = 0; @@ -340,19 +296,17 @@ int ompi_osc_ucx_flush(int target, struct ompi_win_t *win) { int ompi_osc_ucx_flush_all(struct ompi_win_t *win) { ompi_osc_ucx_module_t *module = (ompi_osc_ucx_module_t *)win->w_osc_module; -// int ret; + int ret = OMPI_SUCCESS; if (module->epoch_type.access != PASSIVE_EPOCH && module->epoch_type.access != PASSIVE_ALL_EPOCH) { return OMPI_ERR_RMA_SYNC; } -// ret = opal_common_ucx_worker_flush(mca_osc_ucx_component.ucp_worker); - opal_common_ucx_mem_flush(module->mem, OPAL_COMMON_UCX_SCOPE_WORKER, 0); - -// if (ret != OMPI_SUCCESS) { -// return ret; -// } + ret = opal_common_ucx_mem_flush(module->mem, OPAL_COMMON_UCX_SCOPE_WORKER, 0); + if (ret != OMPI_SUCCESS) { + return ret; + } module->global_ops_num = 0; memset(module->per_target_ops_nums, 0,