Skip to content

Commit 13f58f3

Browse files
author
Thananon Patinyasakdikul
authored
Merge pull request #5274 from thananon/ofi_sep
btl/ofi: add scalable endpoint support.
2 parents 266d5b2 + dae3c94 commit 13f58f3

File tree

8 files changed

+613
-171
lines changed

8 files changed

+613
-171
lines changed

opal/mca/btl/ofi/btl_ofi.h

Lines changed: 46 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,7 @@
4646
#include <rdma/fi_rma.h>
4747

4848
BEGIN_C_DECLS
49-
50-
#define MCA_BTL_OFI_MAX_MODULES 16
51-
#define MCA_BTL_OFI_MAX_WORKERS 1
49+
#define MCA_BTL_OFI_MAX_MODULES 16
5250
#define MCA_BTL_OFI_MAX_CQ_READ_ENTRIES 128
5351

5452
#define MCA_BTL_OFI_ABORT(args) mca_btl_ofi_exit(args)
@@ -62,6 +60,26 @@ enum mca_btl_ofi_type {
6260
MCA_BTL_OFI_TYPE_TOTAL
6361
};
6462

63+
/**
 * Per-context state of the OFI BTL: a numeric id, a transmit and a receive
 * endpoint handle, a completion queue handle, a completion free list that
 * belongs to this context only, and a spin-lock word.
 */
struct mca_btl_ofi_context_t {
64+
int32_t context_id;
65+
66+
/* transmit (tx_ctx) and receive (rx_ctx) contexts */
67+
struct fid_ep *tx_ctx;
68+
struct fid_ep *rx_ctx;
69+
70+
/* completion queue */
71+
struct fid_cq *cq;
72+
73+
/* completion info freelist */
74+
/* We have it per context to reduce the thread contention
75+
* on the freelist. Things can get really slow. */
76+
opal_free_list_t comp_list;
77+
78+
/* spin-lock word: 0 = unlocked, 1 = locked; manipulated by
 * mca_btl_ofi_context_trylock(), _lock() and _unlock() */
79+
volatile int32_t lock;
80+
};
81+
typedef struct mca_btl_ofi_context_t mca_btl_ofi_context_t;
82+
6583
/**
6684
* @brief OFI BTL module
6785
*/
@@ -74,26 +92,25 @@ struct mca_btl_ofi_module_t {
7492
struct fid_fabric *fabric;
7593
struct fid_domain *domain;
7694
struct fid_ep *ofi_endpoint;
77-
struct fid_cq *cq;
7895
struct fid_av *av;
7996

97+
int num_contexts;
98+
mca_btl_ofi_context_t *contexts;
99+
80100
char *linux_device_name;
81101

82102
/** whether the module has been fully initialized or not */
83103
bool initialized;
84104
bool use_virt_addr;
85-
86-
/** spin-lock to protect the module */
87-
volatile int32_t lock;
105+
bool is_scalable_ep;
88106

89107
int64_t outstanding_rdma;
90108

91109
/** linked list of BTL endpoints. this list is never searched so
92110
* there is no need for a complicated structure here at this time*/
93111
opal_list_t endpoints;
94112

95-
/* free lists */
96-
opal_free_list_t comp_list;
113+
opal_mutex_t module_lock;
97114

98115
/** registration cache */
99116
mca_rcache_base_module_t *rcache;
@@ -110,17 +127,14 @@ struct mca_btl_ofi_component_t {
110127

111128
/** number of TL modules */
112129
int module_count;
130+
int num_contexts_per_module;
113131
int num_cqe_read;
114132

115133
size_t namelen;
116134

117135
/** All BTL OFI modules (1 per tl) */
118136
mca_btl_ofi_module_t *modules[MCA_BTL_OFI_MAX_MODULES];
119137

120-
#if OPAL_C_HAVE__THREAD_LOCAL
121-
/** bind threads to contexts */
122-
bool bind_threads_to_contexts;
123-
#endif
124138
};
125139
typedef struct mca_btl_ofi_component_t mca_btl_ofi_component_t;
126140

@@ -151,6 +165,7 @@ struct mca_btl_ofi_completion_t {
151165

152166
struct mca_btl_base_module_t *btl;
153167
struct mca_btl_base_endpoint_t *endpoint;
168+
struct mca_btl_ofi_context_t *my_context;
154169
uint32_t type;
155170

156171
void *local_address;
@@ -269,7 +284,25 @@ int mca_btl_ofi_reg_mem (void *reg_data, void *base, size_t size,
269284
mca_rcache_base_registration_t *reg);
270285
int mca_btl_ofi_dereg_mem (void *reg_data, mca_rcache_base_registration_t *reg);
271286

287+
int mca_btl_ofi_context_progress(mca_btl_ofi_context_t *context);
272288
void mca_btl_ofi_exit(void);
273289

290+
/* thread atomics: spin-lock helpers operating on a context's lock word */
291+
/**
 * Try to take the context spin lock without blocking.
 *
 * NOTE: the return value is inverted relative to what the name suggests.
 * A true result means the lock was NOT acquired (it was already held);
 * false means the caller now owns the lock. mca_btl_ofi_context_lock()
 * relies on this by spinning while the result is true.
 *
 * The plain read of context->lock short-circuits the expression, so the
 * atomic swap is only attempted when the lock word currently reads 0.
 * Assumes OPAL_ATOMIC_SWAP_32 returns the previous value -- TODO confirm.
 * NOTE(review): no explicit acquire barrier follows a successful swap;
 * confirm OPAL_ATOMIC_SWAP_32 provides the ordering required here.
 *
 * @param context  context whose lock word is tested and set
 * @return true if the lock is held by someone else, false if acquired
 */
static inline bool mca_btl_ofi_context_trylock (mca_btl_ofi_context_t *context)
292+
{
293+
return (context->lock || OPAL_ATOMIC_SWAP_32(&context->lock, 1));
294+
}
295+
296+
/**
 * Acquire the context spin lock, busy-waiting until it is obtained.
 *
 * Spins on mca_btl_ofi_context_trylock(), which returns true for as long
 * as the lock could not be taken; the loop body is intentionally empty.
 *
 * @param context  context to lock
 */
static inline void mca_btl_ofi_context_lock(mca_btl_ofi_context_t *context)
298+
{
299+
while (mca_btl_ofi_context_trylock(context));
300+
}
300+
301+
/**
 * Release the context spin lock.
 *
 * Issues opal_atomic_mb() and then stores 0 to the lock word. The barrier
 * comes first, presumably so that writes made while holding the lock are
 * ordered before the release becomes visible -- confirm against the OPAL
 * atomics documentation.
 *
 * @param context  context to unlock
 */
static inline void mca_btl_ofi_context_unlock(mca_btl_ofi_context_t *context)
302+
{
303+
opal_atomic_mb();
304+
context->lock = 0;
305+
}
306+
274307
END_C_DECLS
275308
#endif

opal/mca/btl/ofi/btl_ofi_atomics.c

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,9 @@ int mca_btl_ofi_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_end
4343
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
4444
mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint;
4545
mca_btl_ofi_completion_t *comp = NULL;
46+
mca_btl_ofi_context_t *ofi_context;
47+
48+
ofi_context = get_ofi_context(ofi_btl);
4649

4750
if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) {
4851
fi_datatype = FI_UINT32;
@@ -51,6 +54,7 @@ int mca_btl_ofi_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_end
5154
fi_op = to_fi_op(op);
5255

5356
comp = mca_btl_ofi_completion_alloc(btl, endpoint,
57+
ofi_context,
5458
local_address,
5559
local_handle,
5660
cbfunc, cbcontext, cbdata,
@@ -61,7 +65,7 @@ int mca_btl_ofi_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_end
6165

6266
remote_address = (remote_address - (uint64_t) remote_handle->base_addr);
6367

64-
rc = fi_fetch_atomic(ofi_btl->ofi_endpoint,
68+
rc = fi_fetch_atomic(ofi_context->tx_ctx,
6569
(void*) &comp->operand, 1, NULL, /* operand */
6670
local_address, local_handle->desc, /* results */
6771
btl_endpoint->peer_addr, /* remote addr */
@@ -77,6 +81,9 @@ int mca_btl_ofi_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_end
7781

7882
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
7983

84+
/* force a bit of progress. */
85+
mca_btl_ofi_component.super.btl_progress();
86+
8087
return OPAL_SUCCESS;
8188
}
8289

@@ -92,6 +99,9 @@ int mca_btl_ofi_aop (struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t
9299
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
93100
mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint;
94101
mca_btl_ofi_completion_t *comp = NULL;
102+
mca_btl_ofi_context_t *ofi_context;
103+
104+
ofi_context = get_ofi_context(ofi_btl);
95105

96106
if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) {
97107
fi_datatype = FI_UINT32;
@@ -100,6 +110,7 @@ int mca_btl_ofi_aop (struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t
100110
fi_op = to_fi_op(op);
101111

102112
comp = mca_btl_ofi_completion_alloc(btl, endpoint,
113+
ofi_context,
103114
NULL,
104115
NULL,
105116
cbfunc, cbcontext, cbdata,
@@ -110,7 +121,7 @@ int mca_btl_ofi_aop (struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t
110121

111122
remote_address = (remote_address - (uint64_t) remote_handle->base_addr);
112123

113-
rc = fi_atomic(ofi_btl->ofi_endpoint,
124+
rc = fi_atomic(ofi_context->tx_ctx,
114125
(void*) &comp->operand, 1, NULL, /* operand */
115126
btl_endpoint->peer_addr, /* remote addr */
116127
remote_address, remote_handle->rkey, /* remote buffer */
@@ -124,6 +135,7 @@ int mca_btl_ofi_aop (struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t
124135
}
125136

126137
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
138+
mca_btl_ofi_component.super.btl_progress();
127139

128140
return OPAL_SUCCESS;
129141
}
@@ -139,12 +151,16 @@ int mca_btl_ofi_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_e
139151
mca_btl_ofi_module_t *ofi_btl = (mca_btl_ofi_module_t *) btl;
140152
mca_btl_ofi_endpoint_t *btl_endpoint = (mca_btl_ofi_endpoint_t*) endpoint;
141153
mca_btl_ofi_completion_t *comp = NULL;
154+
mca_btl_ofi_context_t *ofi_context;
155+
156+
ofi_context = get_ofi_context(ofi_btl);
142157

143158
if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) {
144159
fi_datatype = FI_UINT32;
145160
}
146161

147162
comp = mca_btl_ofi_completion_alloc(btl, endpoint,
163+
ofi_context,
148164
local_address,
149165
local_handle,
150166
cbfunc, cbcontext, cbdata,
@@ -157,7 +173,7 @@ int mca_btl_ofi_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_e
157173
remote_address = (remote_address - (uint64_t) remote_handle->base_addr);
158174

159175
/* perform atomic */
160-
rc = fi_compare_atomic(ofi_btl->ofi_endpoint,
176+
rc = fi_compare_atomic(ofi_context->tx_ctx,
161177
(void*) &comp->operand, 1, NULL,
162178
(void*) &comp->compare, NULL,
163179
local_address, local_handle->desc,
@@ -176,5 +192,8 @@ int mca_btl_ofi_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_e
176192

177193
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
178194

195+
/* force a bit of progress. */
196+
mca_btl_ofi_component.super.btl_progress();
197+
179198
return OPAL_SUCCESS;
180199
}

0 commit comments

Comments
 (0)