Skip to content

Commit dae3c94

Browse files
author
Thananon Patinyasakdikul
committed
btl/ofi: add scalable endpoint support.
This commit adds support for scalable endpoints to enhance multithreaded application performance. The BTL will detect support from the OFI provider and will fall back to normal usage if scalable endpoints are not supported. New MCA parameters: - mca_btl_ofi_disable_sep: force the BTL to not use scalable endpoints. - mca_btl_ofi_num_contexts_per_module: number of communication contexts to create (should be the same as the number of threads). Signed-off-by: Thananon Patinyasakdikul <thananon.patinyasakdikul@intel.com>
1 parent 08cfacd commit dae3c94

File tree

8 files changed

+613
-171
lines changed

8 files changed

+613
-171
lines changed

opal/mca/btl/ofi/btl_ofi.h

Lines changed: 46 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,7 @@
4646
#include <rdma/fi_rma.h>
4747

4848
BEGIN_C_DECLS
49-
50-
#define MCA_BTL_OFI_MAX_MODULES 16
51-
#define MCA_BTL_OFI_MAX_WORKERS 1
49+
#define MCA_BTL_OFI_MAX_MODULES 16
5250
#define MCA_BTL_OFI_MAX_CQ_READ_ENTRIES 128
5351

5452
#define MCA_BTL_OFI_ABORT(args) mca_btl_ofi_exit(args)
@@ -62,6 +60,26 @@ enum mca_btl_ofi_type {
6260
MCA_BTL_OFI_TYPE_TOTAL
6361
};
6462

63+
/**
 * @brief OFI BTL communication context
 *
 * Groups the resources that belong to a single context: a transmit and a
 * receive endpoint handle, a completion queue, a private free list of
 * completion descriptors, and the lock word used by
 * mca_btl_ofi_context_trylock()/lock()/unlock().
 */
struct mca_btl_ofi_context_t {
64+
int32_t context_id;
65+
66+
/* transmit and receive contexts */
67+
struct fid_ep *tx_ctx;
68+
struct fid_ep *rx_ctx;
69+
70+
/* completion queue */
71+
struct fid_cq *cq;
72+
73+
/* completion info freelist */
74+
/* We have it per context to reduce the thread contention
75+
* on the freelist. Things can get really slow. */
76+
opal_free_list_t comp_list;
77+
78+
/* for thread locking: set to 1 by the trylock swap, reset to 0 on unlock */
79+
volatile int32_t lock;
80+
};
81+
typedef struct mca_btl_ofi_context_t mca_btl_ofi_context_t;
82+
6583
/**
6684
* @brief OFI BTL module
6785
*/
@@ -74,26 +92,25 @@ struct mca_btl_ofi_module_t {
7492
struct fid_fabric *fabric;
7593
struct fid_domain *domain;
7694
struct fid_ep *ofi_endpoint;
77-
struct fid_cq *cq;
7895
struct fid_av *av;
7996

97+
int num_contexts;
98+
mca_btl_ofi_context_t *contexts;
99+
80100
char *linux_device_name;
81101

82102
/** whether the module has been fully initialized or not */
83103
bool initialized;
84104
bool use_virt_addr;
85-
86-
/** spin-lock to protect the module */
87-
volatile int32_t lock;
105+
bool is_scalable_ep;
88106

89107
int64_t outstanding_rdma;
90108

91109
/** linked list of BTL endpoints. this list is never searched so
92110
* there is no need for a complicated structure here at this time*/
93111
opal_list_t endpoints;
94112

95-
/* free lists */
96-
opal_free_list_t comp_list;
113+
opal_mutex_t module_lock;
97114

98115
/** registration cache */
99116
mca_rcache_base_module_t *rcache;
@@ -110,17 +127,14 @@ struct mca_btl_ofi_component_t {
110127

111128
/** number of TL modules */
112129
int module_count;
130+
int num_contexts_per_module;
113131
int num_cqe_read;
114132

115133
size_t namelen;
116134

117135
/** All BTL OFI modules (1 per tl) */
118136
mca_btl_ofi_module_t *modules[MCA_BTL_OFI_MAX_MODULES];
119137

120-
#if OPAL_C_HAVE__THREAD_LOCAL
121-
/** bind threads to contexts */
122-
bool bind_threads_to_contexts;
123-
#endif
124138
};
125139
typedef struct mca_btl_ofi_component_t mca_btl_ofi_component_t;
126140

@@ -151,6 +165,7 @@ struct mca_btl_ofi_completion_t {
151165

152166
struct mca_btl_base_module_t *btl;
153167
struct mca_btl_base_endpoint_t *endpoint;
168+
struct mca_btl_ofi_context_t *my_context;
154169
uint32_t type;
155170

156171
void *local_address;
@@ -269,7 +284,25 @@ int mca_btl_ofi_reg_mem (void *reg_data, void *base, size_t size,
269284
mca_rcache_base_registration_t *reg);
270285
int mca_btl_ofi_dereg_mem (void *reg_data, mca_rcache_base_registration_t *reg);
271286

287+
int mca_btl_ofi_context_progress(mca_btl_ofi_context_t *context);
272288
void mca_btl_ofi_exit(void);
273289

290+
/* thread atomics */
291+
/**
 * @brief Try to take the context lock without waiting.
 *
 * Evaluates to true when context->lock is already non-zero (the atomic swap
 * is then skipped by short-circuit evaluation) or when
 * OPAL_ATOMIC_SWAP_32(&context->lock, 1) yields a non-zero value.
 * mca_btl_ofi_context_lock() retries for as long as this returns true, so a
 * false result is the "lock acquired" case.
 *
 * NOTE(review): assumes OPAL_ATOMIC_SWAP_32 returns the previous value of the
 * lock word -- confirm against the OPAL atomics implementation.
 */
static inline bool mca_btl_ofi_context_trylock (mca_btl_ofi_context_t *context)
292+
{
293+
return (context->lock || OPAL_ATOMIC_SWAP_32(&context->lock, 1));
294+
}
295+
296+
/**
 * @brief Acquire the context lock.
 *
 * Busy-waits (the loop body is empty) until
 * mca_btl_ofi_context_trylock() returns false.
 */
static inline void mca_btl_ofi_context_lock(mca_btl_ofi_context_t *context)
297+
{
298+
while (mca_btl_ofi_context_trylock(context));
299+
}
300+
301+
/**
 * @brief Release the context lock.
 *
 * Calls opal_atomic_mb() and then stores 0 to context->lock. The barrier is
 * presumably there so that writes made while holding the lock are ordered
 * before the lock reads as free -- confirm against opal_atomic_mb() semantics.
 */
static inline void mca_btl_ofi_context_unlock(mca_btl_ofi_context_t *context)
302+
{
303+
opal_atomic_mb();
304+
context->lock = 0;
305+
}
306+
274307
END_C_DECLS
275308
#endif

opal/mca/btl/ofi/btl_ofi_atomics.c

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,9 @@ int mca_btl_ofi_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_end
4343
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
4444
mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint;
4545
mca_btl_ofi_completion_t *comp = NULL;
46+
mca_btl_ofi_context_t *ofi_context;
47+
48+
ofi_context = get_ofi_context(ofi_btl);
4649

4750
if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) {
4851
fi_datatype = FI_UINT32;
@@ -51,6 +54,7 @@ int mca_btl_ofi_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_end
5154
fi_op = to_fi_op(op);
5255

5356
comp = mca_btl_ofi_completion_alloc(btl, endpoint,
57+
ofi_context,
5458
local_address,
5559
local_handle,
5660
cbfunc, cbcontext, cbdata,
@@ -61,7 +65,7 @@ int mca_btl_ofi_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_end
6165

6266
remote_address = (remote_address - (uint64_t) remote_handle->base_addr);
6367

64-
rc = fi_fetch_atomic(ofi_btl->ofi_endpoint,
68+
rc = fi_fetch_atomic(ofi_context->tx_ctx,
6569
(void*) &comp->operand, 1, NULL, /* operand */
6670
local_address, local_handle->desc, /* results */
6771
btl_endpoint->peer_addr, /* remote addr */
@@ -77,6 +81,9 @@ int mca_btl_ofi_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_end
7781

7882
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
7983

84+
/* force a bit of progress. */
85+
mca_btl_ofi_component.super.btl_progress();
86+
8087
return OPAL_SUCCESS;
8188
}
8289

@@ -92,6 +99,9 @@ int mca_btl_ofi_aop (struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t
9299
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
93100
mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint;
94101
mca_btl_ofi_completion_t *comp = NULL;
102+
mca_btl_ofi_context_t *ofi_context;
103+
104+
ofi_context = get_ofi_context(ofi_btl);
95105

96106
if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) {
97107
fi_datatype = FI_UINT32;
@@ -100,6 +110,7 @@ int mca_btl_ofi_aop (struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t
100110
fi_op = to_fi_op(op);
101111

102112
comp = mca_btl_ofi_completion_alloc(btl, endpoint,
113+
ofi_context,
103114
NULL,
104115
NULL,
105116
cbfunc, cbcontext, cbdata,
@@ -110,7 +121,7 @@ int mca_btl_ofi_aop (struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t
110121

111122
remote_address = (remote_address - (uint64_t) remote_handle->base_addr);
112123

113-
rc = fi_atomic(ofi_btl->ofi_endpoint,
124+
rc = fi_atomic(ofi_context->tx_ctx,
114125
(void*) &comp->operand, 1, NULL, /* operand */
115126
btl_endpoint->peer_addr, /* remote addr */
116127
remote_address, remote_handle->rkey, /* remote buffer */
@@ -124,6 +135,7 @@ int mca_btl_ofi_aop (struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t
124135
}
125136

126137
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
138+
mca_btl_ofi_component.super.btl_progress();
127139

128140
return OPAL_SUCCESS;
129141
}
@@ -139,12 +151,16 @@ int mca_btl_ofi_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_e
139151
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
140152
mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint;
141153
mca_btl_ofi_completion_t *comp = NULL;
154+
mca_btl_ofi_context_t *ofi_context;
155+
156+
ofi_context = get_ofi_context(ofi_btl);
142157

143158
if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) {
144159
fi_datatype = FI_UINT32;
145160
}
146161

147162
comp = mca_btl_ofi_completion_alloc(btl, endpoint,
163+
ofi_context,
148164
local_address,
149165
local_handle,
150166
cbfunc, cbcontext, cbdata,
@@ -157,7 +173,7 @@ int mca_btl_ofi_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_e
157173
remote_address = (remote_address - (uint64_t) remote_handle->base_addr);
158174

159175
/* perform atomic */
160-
rc = fi_compare_atomic(ofi_btl->ofi_endpoint,
176+
rc = fi_compare_atomic(ofi_context->tx_ctx,
161177
(void*) &comp->operand, 1, NULL,
162178
(void*) &comp->compare, NULL,
163179
local_address, local_handle->desc,
@@ -176,5 +192,8 @@ int mca_btl_ofi_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_e
176192

177193
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
178194

195+
/* force a bit of progress. */
196+
mca_btl_ofi_component.super.btl_progress();
197+
179198
return OPAL_SUCCESS;
180199
}

0 commit comments

Comments
 (0)