Skip to content

Commit 07cb413

Browse files
xinzhao3 authored and artpol84 committed
opal/common/ucx: Set of bug fixes in wpool
Signed-off-by: Xin Zhao <xinz@mellanox.com>
1 parent 344bb64 commit 07cb413

File tree

2 files changed

+23
-25
lines changed

2 files changed

+23
-25
lines changed

opal/mca/common/ucx/common_ucx_wpool.c

Lines changed: 21 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -817,20 +817,19 @@ static void _common_ucx_tls_cleanup(_tlocal_table_t *tls)
817817
// Cleanup memory table
818818
size = tls->mem_tbl_size;
819819
for (i = 0; i < size; i++) {
820-
if (NULL == tls->mem_tbl[i]->gmem){
821-
continue;
820+
if (NULL != tls->mem_tbl[i]->gmem){
821+
_tlocal_mem_record_cleanup(tls->mem_tbl[i]);
822822
}
823-
_tlocal_mem_record_cleanup(tls->mem_tbl[i]);
823+
824824
free(tls->mem_tbl[i]);
825825
}
826826

827827
// Cleanup ctx table
828828
size = tls->ctx_tbl_size;
829829
for (i = 0; i < size; i++) {
830-
if (NULL == tls->ctx_tbl[i]->gctx){
831-
continue;
830+
if (NULL != tls->ctx_tbl[i]->gctx){
831+
_tlocal_ctx_record_cleanup(tls->ctx_tbl[i]);
832832
}
833-
_tlocal_ctx_record_cleanup(tls->ctx_tbl[i]);
834833
free(tls->ctx_tbl[i]);
835834
}
836835

@@ -918,7 +917,7 @@ static _tlocal_ctx_t *
918917
_tlocal_add_ctx(_tlocal_table_t *tls, opal_common_ucx_ctx_t *ctx)
919918
{
920919
size_t i, free_idx = -1;
921-
int rc;
920+
int rc, found = 0;
922921

923922
/* Try to find available record in the TLS table
924923
* In parallel perform deferred cleanups */
@@ -929,14 +928,15 @@ _tlocal_add_ctx(_tlocal_table_t *tls, opal_common_ucx_ctx_t *ctx)
929928
_tlocal_ctx_record_cleanup(tls->ctx_tbl[i]);
930929
}
931930
}
932-
if ((NULL != tls->ctx_tbl[i]->gctx) && (0 > free_idx)) {
931+
if ((NULL == tls->ctx_tbl[i]->gctx) && !found) {
933932
/* Found clean record */
934933
free_idx = i;
934+
found = 1;
935935
}
936936
}
937937

938938
/* if needed - extend the table */
939-
if (0 > free_idx) {
939+
if (!found) {
940940
free_idx = tls->ctx_tbl_size;
941941
rc = _tlocal_tls_ctxtbl_extend(tls, 4);
942942
if (rc) {
@@ -1025,15 +1025,6 @@ _tlocal_mem_record_cleanup(_tlocal_mem_t *mem_rec)
10251025
size_t i;
10261026
WPOOL_DBG_OUT(_dbg_tls || _dbg_mem, "record=%p, is_freed = %d\n",
10271027
(void *)mem_rec, mem_rec->gmem->released);
1028-
if (mem_rec->gmem->released) {
1029-
return;
1030-
}
1031-
/* Remove myself from the memory context structure
1032-
* This may result in context release as we are using
1033-
* delayed cleanup */
1034-
_common_ucx_mem_signout(mem_rec->gmem);
1035-
WPOOL_DBG_OUT(_dbg_tls || _dbg_mem, "gmem = %p mem_rec = %p\n",
1036-
(void *)mem_rec->gmem, (void *)mem_rec);
10371028

10381029
for(i = 0; i < mem_rec->gmem->ctx->comm_size; i++) {
10391030
if (mem_rec->mem->rkeys[i]) {
@@ -1044,6 +1035,13 @@ _tlocal_mem_record_cleanup(_tlocal_mem_t *mem_rec)
10441035
}
10451036
free(mem_rec->mem->rkeys);
10461037

1038+
/* Remove myself from the memory context structure
1039+
* This may result in context release as we are using
1040+
* delayed cleanup */
1041+
_common_ucx_mem_signout(mem_rec->gmem);
1042+
WPOOL_DBG_OUT(_dbg_tls || _dbg_mem, "gmem = %p mem_rec = %p\n",
1043+
(void *)mem_rec->gmem, (void *)mem_rec);
1044+
10471045
/* Release fast-path pointers */
10481046
if (NULL != mem_rec->mem_tls_ptr) {
10491047
free(mem_rec->mem_tls_ptr);
@@ -1059,24 +1057,24 @@ static _tlocal_mem_t *_tlocal_add_mem(_tlocal_table_t *tls,
10591057
{
10601058
size_t i, free_idx = -1;
10611059
_tlocal_ctx_t *ctx_rec = NULL;
1062-
int rc = OPAL_SUCCESS;
1060+
int rc = OPAL_SUCCESS, found = 0;
10631061

10641062
/* Try to find available spot in the table */
10651063
for (i=0; i<tls->mem_tbl_size; i++) {
1066-
if (NULL == tls->mem_tbl[i]->gmem) {
1064+
if (NULL != tls->mem_tbl[i]->gmem) {
10671065
if (tls->mem_tbl[i]->gmem->released) {
10681066
/* Found a dirty record. Need to clean it first */
10691067
_tlocal_mem_record_cleanup(tls->mem_tbl[i]);
1070-
break;
10711068
}
10721069
}
1073-
if ((NULL == tls->mem_tbl[i]->gmem) && (0 > free_idx)) {
1070+
if ((NULL == tls->mem_tbl[i]->gmem) && !found) {
10741071
/* Found a clear record */
10751072
free_idx = i;
1073+
found = 1;
10761074
}
10771075
}
10781076

1079-
if (0 > free_idx){
1077+
if (!found){
10801078
free_idx = tls->mem_tbl_size;
10811079
rc = _tlocal_tls_memtbl_extend(tls, 4);
10821080
if (rc != OPAL_SUCCESS) {

opal/mca/common/ucx/common_ucx_wpool.h

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -110,7 +110,7 @@ typedef int (*opal_common_ucx_exchange_func_t)(void *my_info, size_t my_info_len
110110
void *metadata);
111111

112112
/* For developer use only */
113-
#define OPAL_COMMON_UCX_WPOOL_DBG
113+
//#define OPAL_COMMON_UCX_WPOOL_DBG
114114
#ifdef OPAL_COMMON_UCX_WPOOL_DBG
115115
extern __thread FILE *tls_pf;
116116
extern __thread int initialized;
@@ -168,7 +168,7 @@ static inline void opal_common_ucx_wpool_dbg_init(void)
168168
}
169169

170170
#else
171-
#define DBG_OUT(...)
171+
#define WPOOL_DBG_OUT(...)
172172
#endif
173173

174174

0 commit comments

Comments
 (0)