Skip to content

Commit b0ac627

Browse files
committed
btl/ugni: improve multi-threaded RDMA performance
This commit improves the injection rate and latency for RDMA operations. This is done by the following improvements: - If C11's _Thread_local keyword is available then always use the same virtual device index for the same thread when using RDMA. If the keyword is not available then attempt to use any device that isn't already in use. The binding support is enabled by default but can be disabled via the btl_ugni_bind_devices MCA variable. - When posting FMA and RDMA operations always attempt to reap completions after posting the operation. This allows us to better balance the work of reaping completions across all application threads. - Limit the total number of outstanding BTE transactions. This fixes a performance bug when using many threads. - Split out RDMA and local SMSG completion queue sizes. The RDMA queue size is better tuned for performance with RMA-MT. - Split out put and get FMA limits. The old btl_ugni_fma_limit MCA variable is deprecated. The new variable names are: btl_ugni_fma_put_limit and btl_ugni_fma_get_limit. - Change how post descriptors are handled. They are no longer allocated separately from the RDMA endpoints. - Some cleanup to move error code out of the critical path. - Disable the FMA sharing flag on the CDM when we detect that there should be enough FMA descriptors for the number of virtual devices we plan to create. If the user sets this flag we will not unset it. This change should improve the small-message RMA performance by ~10%. Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
1 parent 0ddbc75 commit b0ac627

14 files changed

+817
-592
lines changed

opal/mca/btl/ugni/btl_ugni.h

Lines changed: 99 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
22
/*
3-
* Copyright (c) 2011-2017 Los Alamos National Security, LLC. All rights
3+
* Copyright (c) 2011-2018 Los Alamos National Security, LLC. All rights
44
* reserved.
55
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
66
* Copyright (c) 2014 Research Organization for Information Science
@@ -51,7 +51,7 @@
5151
#define MCA_BTL_UGNI_MAX_DEV_HANDLES 128
5252

5353
/** number of rdma completion queue items to remove per progress loop */
54-
#define MCA_BTL_UGNI_COMPLETIONS_PER_LOOP 16
54+
#define MCA_BTL_UGNI_COMPLETIONS_PER_LOOP 32
5555

5656
/** how often to check for connection requests */
5757
#define MCA_BTL_UGNI_CONNECT_USEC 10
@@ -96,7 +96,7 @@ struct mca_btl_ugni_cq_t {
9696
/** ugni CQ handle */
9797
gni_cq_handle_t gni_handle;
9898
/** number of completions expected on the CQ */
99-
int32_t active_operations;
99+
volatile int32_t active_operations;
100100
};
101101
typedef struct mca_btl_ugni_cq_t mca_btl_ugni_cq_t;
102102

@@ -116,6 +116,9 @@ struct mca_btl_ugni_device_t {
116116
/** number of SMSG connections */
117117
volatile int32_t smsg_connections;
118118

119+
/** boolean indicating that the device was recently flushed */
120+
volatile bool flushed;
121+
119122
/** uGNI device handle */
120123
gni_nic_handle_t dev_handle;
121124

@@ -132,10 +135,7 @@ struct mca_btl_ugni_device_t {
132135
gni_mem_handle_t smsg_irq_mhndl;
133136

134137
/** RDMA endpoint free list */
135-
opal_free_list_t endpoints;
136-
137-
/** post descriptors pending resources */
138-
opal_list_t pending_post;
138+
opal_free_list_t rdma_descs;
139139
};
140140
typedef struct mca_btl_ugni_device_t mca_btl_ugni_device_t;
141141

@@ -162,8 +162,6 @@ typedef struct mca_btl_ugni_module_t {
162162
opal_mutex_t eager_get_pending_lock;
163163
opal_list_t eager_get_pending;
164164

165-
opal_free_list_t post_descriptors;
166-
167165
mca_mpool_base_module_t *mpool;
168166
opal_free_list_t smsg_mboxes;
169167

@@ -196,9 +194,7 @@ typedef struct mca_btl_ugni_module_t {
196194
* this rank should be limited too */
197195
int nlocal_procs;
198196

199-
volatile int active_send_count;
200-
volatile int64_t connected_peer_count;
201-
volatile int64_t active_rdma_count;
197+
volatile int32_t active_rdma_count;
202198

203199
mca_rcache_base_module_t *rcache;
204200
} mca_btl_ugni_module_t;
@@ -212,6 +208,10 @@ typedef struct mca_btl_ugni_component_t {
212208
/* Maximum number of entries a completion queue can hold */
213209
uint32_t remote_cq_size;
214210
uint32_t local_cq_size;
211+
uint32_t local_rdma_cq_size;
212+
/* There is a hardware limitation that hurts BTE performance
213+
* if we submit too many BTE requests. This acts as a throttle. */
214+
int32_t active_rdma_threshold;
215215

216216
/* number of ugni modules */
217217
uint32_t ugni_num_btls;
@@ -221,7 +221,16 @@ typedef struct mca_btl_ugni_component_t {
221221
size_t smsg_max_data;
222222

223223
/* After this message size switch to BTE protocols */
224-
size_t ugni_fma_limit;
224+
long int ugni_fma_limit;
225+
/** FMA switchover for get */
226+
long int ugni_fma_get_limit;
227+
/** FMA switchover for put */
228+
long int ugni_fma_put_limit;
229+
230+
#if OPAL_C_HAVE__THREAD_LOCAL
231+
bool bind_threads_to_devices;
232+
#endif
233+
225234
/* Switch to get when sending above this size */
226235
size_t ugni_smsg_limit;
227236

@@ -282,25 +291,30 @@ typedef struct mca_btl_ugni_component_t {
282291

283292
/** NIC address */
284293
uint32_t dev_addr;
294+
295+
/** MCA variable identifier for the cdm_flags variable */
296+
int cdm_flags_id;
285297
} mca_btl_ugni_component_t;
286298

287299
/* Global structures */
288300

289301
OPAL_MODULE_DECLSPEC extern mca_btl_ugni_component_t mca_btl_ugni_component;
290302
OPAL_MODULE_DECLSPEC extern mca_btl_ugni_module_t mca_btl_ugni_module;
291303

292-
/**
293-
* Get a virtual device for communication
294-
*/
295-
static inline mca_btl_ugni_device_t *mca_btl_ugni_ep_get_device (mca_btl_ugni_module_t *ugni_module)
304+
static inline uint32_t mca_btl_ugni_ep_get_device_index (mca_btl_ugni_module_t *ugni_module)
296305
{
297306
static volatile uint32_t device_index = (uint32_t) 0;
298-
uint32_t dev_index;
299307

300308
/* don't really care if the device index is atomically updated */
301-
dev_index = (device_index++) & (mca_btl_ugni_component.virtual_device_count - 1);
309+
return opal_atomic_fetch_add_32 (&device_index, 1) % mca_btl_ugni_component.virtual_device_count;
310+
}
302311

303-
return ugni_module->devices + dev_index;
312+
/**
313+
* Get a virtual device for communication
314+
*/
315+
static inline mca_btl_ugni_device_t *mca_btl_ugni_ep_get_device (mca_btl_ugni_module_t *ugni_module)
316+
{
317+
return ugni_module->devices + mca_btl_ugni_ep_get_device_index (ugni_module);
304318
}
305319

306320
static inline int mca_btl_rc_ugni_to_opal (gni_return_t rc)
@@ -322,6 +336,9 @@ static inline int mca_btl_rc_ugni_to_opal (gni_return_t rc)
322336
return codes[rc];
323337
}
324338

339+
340+
int mca_btl_ugni_flush (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint);
341+
325342
/**
326343
* BML->BTL notification of change in the process list.
327344
*
@@ -481,6 +498,16 @@ static inline uint64_t mca_btl_ugni_proc_name_to_id (opal_process_name_t name) {
481498
int mca_btl_ugni_spawn_progress_thread(struct mca_btl_base_module_t* btl);
482499
int mca_btl_ugni_kill_progress_thread(void);
483500

501+
struct mca_btl_ugni_post_descriptor_t;
502+
503+
void btl_ugni_dump_post_desc (struct mca_btl_ugni_post_descriptor_t *desc);
504+
505+
506+
struct mca_btl_ugni_post_descriptor_t;
507+
508+
void mca_btl_ugni_handle_rdma_completions (mca_btl_ugni_module_t *ugni_module, mca_btl_ugni_device_t *device,
509+
struct mca_btl_ugni_post_descriptor_t *post_desc, const int count);
510+
484511
/**
485512
* Try to lock a uGNI device for exclusive access
486513
*/
@@ -531,6 +558,58 @@ static inline intptr_t mca_btl_ugni_device_serialize (mca_btl_ugni_device_t *dev
531558
return rc;
532559
}
533560

561+
static inline intptr_t mca_btl_ugni_device_serialize_any (mca_btl_ugni_module_t *ugni_module,
562+
mca_btl_ugni_device_serialize_fn_t fn, void *arg)
563+
{
564+
mca_btl_ugni_device_t *device;
565+
intptr_t rc;
566+
567+
if (!opal_using_threads ()) {
568+
return fn (ugni_module->devices, arg);
569+
}
570+
571+
#if OPAL_C_HAVE__THREAD_LOCAL
572+
if (mca_btl_ugni_component.bind_threads_to_devices) {
573+
/* NTH: if we have C11 _Thread_local just go ahead and assign the devices round-robin to each
574+
* thread. in testing this gives much better performance than just picking any device */
575+
static _Thread_local mca_btl_ugni_device_t *device_local = NULL;
576+
577+
device = device_local;
578+
if (OPAL_UNLIKELY(NULL == device)) {
579+
/* assign device contexts round-robin */
580+
device_local = device = mca_btl_ugni_ep_get_device (ugni_module);
581+
}
582+
583+
mca_btl_ugni_device_lock (device);
584+
} else {
585+
#endif
586+
/* get the next starting index */
587+
uint32_t device_index = mca_btl_ugni_ep_get_device_index (ugni_module);
588+
const int device_count = mca_btl_ugni_component.virtual_device_count;
589+
590+
for (int i = 0 ; i < device_count ; ++i) {
591+
device = ugni_module->devices + ((device_index + i) % device_count);
592+
if (!mca_btl_ugni_device_trylock (device)) {
593+
break;
594+
}
595+
596+
device = NULL;
597+
}
598+
599+
if (NULL == device) {
600+
device = mca_btl_ugni_ep_get_device (ugni_module);
601+
mca_btl_ugni_device_lock (device);
602+
}
603+
#if OPAL_C_HAVE__THREAD_LOCAL
604+
}
605+
#endif
606+
607+
rc = fn (device, arg);
608+
mca_btl_ugni_device_unlock (device);
609+
610+
return rc;
611+
}
612+
534613

535614
/** Number of times the progress thread has woken up */
536615
extern unsigned int mca_btl_ugni_progress_thread_wakeups;

opal/mca/btl/ugni/btl_ugni_add_procs.c

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ int mca_btl_ugni_add_procs (struct mca_btl_base_module_t* btl, size_t nprocs,
7979
if (false == ugni_module->initialized) {
8080
for (int i = 0 ; i < mca_btl_ugni_component.virtual_device_count ; ++i) {
8181
mca_btl_ugni_device_t *device = ugni_module->devices + i;
82-
rc = GNI_CqCreate (device->dev_handle, mca_btl_ugni_component.local_cq_size, 0,
82+
rc = GNI_CqCreate (device->dev_handle, mca_btl_ugni_component.local_rdma_cq_size, 0,
8383
GNI_CQ_NOBLOCK, NULL, NULL, &device->dev_rdma_local_cq.gni_handle);
8484
if (GNI_RC_SUCCESS != rc) {
8585
BTL_ERROR(("error creating local BTE/FMA CQ"));
@@ -94,7 +94,7 @@ int mca_btl_ugni_add_procs (struct mca_btl_base_module_t* btl, size_t nprocs,
9494
}
9595

9696
if (mca_btl_ugni_component.progress_thread_enabled) {
97-
rc = GNI_CqCreate (device->dev_handle, mca_btl_ugni_component.local_cq_size,
97+
rc = GNI_CqCreate (device->dev_handle, mca_btl_ugni_component.local_rdma_cq_size,
9898
0, GNI_CQ_BLOCKING, NULL, NULL, &device->dev_rdma_local_irq_cq.gni_handle);
9999
if (GNI_RC_SUCCESS != rc) {
100100
BTL_ERROR(("error creating local BTE/FMA CQ"));
@@ -448,15 +448,6 @@ mca_btl_ugni_setup_mpools (mca_btl_ugni_module_t *ugni_module)
448448
return rc;
449449
}
450450

451-
rc = opal_free_list_init (&ugni_module->post_descriptors,
452-
sizeof (mca_btl_ugni_post_descriptor_t),
453-
8, OBJ_CLASS(mca_btl_ugni_post_descriptor_t),
454-
0, 0, 0, -1, 256, NULL, 0, NULL, NULL, NULL);
455-
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
456-
BTL_ERROR(("error creating post descriptor free list"));
457-
return rc;
458-
}
459-
460451
return OPAL_SUCCESS;
461452
}
462453

opal/mca/btl/ugni/btl_ugni_atomic.c

Lines changed: 26 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
22
/*
3-
* Copyright (c) 2014-2017 Los Alamos National Security, LLC. All rights
3+
* Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
44
* reserved.
55
* $COPYRIGHT$
66
*
@@ -79,8 +79,8 @@ int mca_btl_ugni_aop (struct mca_btl_base_module_t *btl, struct mca_btl_base_end
7979
mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
8080
{
8181
gni_mem_handle_t dummy = {0, 0};
82-
mca_btl_ugni_post_descriptor_t *post_desc;
83-
int gni_op, rc, type;
82+
mca_btl_ugni_post_descriptor_t post_desc;
83+
int gni_op, type;
8484
size_t size;
8585

8686
size = (MCA_BTL_ATOMIC_FLAG_32BIT & flags) ? 4 : 8;
@@ -95,23 +95,13 @@ int mca_btl_ugni_aop (struct mca_btl_base_module_t *btl, struct mca_btl_base_end
9595
return OPAL_ERR_NOT_SUPPORTED;
9696
}
9797

98-
post_desc = mca_btl_ugni_alloc_post_descriptor (endpoint, NULL, cbfunc, cbcontext, cbdata);
99-
if (OPAL_UNLIKELY(NULL == post_desc)) {
100-
return OPAL_ERR_OUT_OF_RESOURCE;
101-
}
102-
103-
init_gni_post_desc (post_desc, order, GNI_POST_AMO, 0, dummy, remote_address,
104-
remote_handle->gni_handle, size, 0);
105-
post_desc->desc.amo_cmd = gni_op;
106-
107-
post_desc->desc.first_operand = operand;
108-
109-
rc = mca_btl_ugni_endpoint_post_fma (endpoint, post_desc);
110-
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
111-
mca_btl_ugni_return_post_descriptor (post_desc);
112-
}
98+
init_post_desc (&post_desc, endpoint, order, GNI_POST_AMO, 0, dummy, remote_address,
99+
remote_handle->gni_handle, size, 0, cbfunc, cbcontext, cbdata,
100+
NULL);
101+
post_desc.gni_desc.amo_cmd = gni_op;
102+
post_desc.gni_desc.first_operand = operand;
113103

114-
return rc;
104+
return mca_btl_ugni_endpoint_post_fma (endpoint, &post_desc);
115105
}
116106

117107
int mca_btl_ugni_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
@@ -120,8 +110,8 @@ int mca_btl_ugni_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_en
120110
uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
121111
void *cbcontext, void *cbdata)
122112
{
123-
mca_btl_ugni_post_descriptor_t *post_desc;
124-
int gni_op, rc, type;
113+
mca_btl_ugni_post_descriptor_t post_desc;
114+
int gni_op, type;
125115
size_t size;
126116

127117
size = (MCA_BTL_ATOMIC_FLAG_32BIT & flags) ? 4 : 8;
@@ -136,55 +126,33 @@ int mca_btl_ugni_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_en
136126
return OPAL_ERR_NOT_SUPPORTED;
137127
}
138128

139-
post_desc = mca_btl_ugni_alloc_post_descriptor (endpoint, local_handle, cbfunc, cbcontext, cbdata);
140-
if (OPAL_UNLIKELY(NULL == post_desc)) {
141-
return OPAL_ERR_OUT_OF_RESOURCE;
142-
}
143-
144-
145-
init_gni_post_desc (post_desc, order, GNI_POST_AMO, (intptr_t) local_address, local_handle->gni_handle,
146-
remote_address, remote_handle->gni_handle, size, 0);
147-
post_desc->desc.amo_cmd = gni_op;
148-
149-
post_desc->desc.first_operand = operand;
129+
init_post_desc (&post_desc, endpoint, order, GNI_POST_AMO, (intptr_t) local_address,
130+
local_handle->gni_handle, remote_address, remote_handle->gni_handle,
131+
size, 0, cbfunc, cbcontext, cbdata, local_handle);
132+
post_desc.gni_desc.amo_cmd = gni_op;
133+
post_desc.gni_desc.first_operand = operand;
150134

151-
rc = mca_btl_ugni_endpoint_post_fma (endpoint, post_desc);
152-
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
153-
mca_btl_ugni_return_post_descriptor (post_desc);
154-
}
155-
156-
return rc;
135+
return mca_btl_ugni_endpoint_post_fma (endpoint, &post_desc);
157136
}
158137

159138
int mca_btl_ugni_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
160139
void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
161140
mca_btl_base_registration_handle_t *remote_handle, uint64_t compare, uint64_t value, int flags,
162141
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
163142
{
164-
mca_btl_ugni_post_descriptor_t *post_desc;
165-
int gni_op, rc;
143+
mca_btl_ugni_post_descriptor_t post_desc;
166144
size_t size;
145+
int gni_op;
167146

168147
gni_op = (MCA_BTL_ATOMIC_FLAG_32BIT & flags) ? GNI_FMA_ATOMIC2_CSWAP_S : GNI_FMA_ATOMIC_CSWAP;
169148
size = (MCA_BTL_ATOMIC_FLAG_32BIT & flags) ? 4 : 8;
170149

171-
post_desc = mca_btl_ugni_alloc_post_descriptor (endpoint, local_handle, cbfunc, cbcontext, cbdata);
172-
if (OPAL_UNLIKELY(NULL == post_desc)) {
173-
return OPAL_ERR_OUT_OF_RESOURCE;
174-
}
175-
176-
177-
init_gni_post_desc (post_desc, order, GNI_POST_AMO, (intptr_t) local_address, local_handle->gni_handle,
178-
remote_address, remote_handle->gni_handle, size, 0);
179-
post_desc->desc.amo_cmd = gni_op;
180-
181-
post_desc->desc.first_operand = compare;
182-
post_desc->desc.second_operand = value;
183-
184-
rc = mca_btl_ugni_endpoint_post_fma (endpoint, post_desc);
185-
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
186-
mca_btl_ugni_return_post_descriptor (post_desc);
187-
}
150+
init_post_desc (&post_desc, endpoint, order, GNI_POST_AMO, (intptr_t) local_address,
151+
local_handle->gni_handle, remote_address, remote_handle->gni_handle, size, 0,
152+
cbfunc, cbcontext, cbdata, local_handle);
153+
post_desc.gni_desc.amo_cmd = gni_op;
154+
post_desc.gni_desc.first_operand = compare;
155+
post_desc.gni_desc.second_operand = value;
188156

189-
return rc;
157+
return mca_btl_ugni_endpoint_post_fma (endpoint, &post_desc);
190158
}

0 commit comments

Comments
 (0)