1
1
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
2
2
/*
3
- * Copyright (c) 2011-2017 Los Alamos National Security, LLC. All rights
3
+ * Copyright (c) 2011-2018 Los Alamos National Security, LLC. All rights
4
4
* reserved.
5
5
* Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
6
6
* Copyright (c) 2014 Research Organization for Information Science
51
51
#define MCA_BTL_UGNI_MAX_DEV_HANDLES 128
52
52
53
53
/** number of rdma completion queue items to remove per progress loop */
54
- #define MCA_BTL_UGNI_COMPLETIONS_PER_LOOP 16
54
+ #define MCA_BTL_UGNI_COMPLETIONS_PER_LOOP 32
55
55
56
56
/** how often to check for connection requests */
57
57
#define MCA_BTL_UGNI_CONNECT_USEC 10
@@ -96,7 +96,7 @@ struct mca_btl_ugni_cq_t {
96
96
/** ugni CQ handle */
97
97
gni_cq_handle_t gni_handle ;
98
98
/** number of completions expected on the CQ */
99
- int32_t active_operations ;
99
+ volatile int32_t active_operations ;
100
100
};
101
101
typedef struct mca_btl_ugni_cq_t mca_btl_ugni_cq_t ;
102
102
@@ -116,6 +116,9 @@ struct mca_btl_ugni_device_t {
116
116
/** number of SMSG connections */
117
117
volatile int32_t smsg_connections ;
118
118
119
+ /** boolean indicating that the device was recently flushed */
120
+ volatile bool flushed ;
121
+
119
122
/** uGNI device handle */
120
123
gni_nic_handle_t dev_handle ;
121
124
@@ -132,10 +135,7 @@ struct mca_btl_ugni_device_t {
132
135
gni_mem_handle_t smsg_irq_mhndl ;
133
136
134
137
/** RDMA endpoint free list */
135
- opal_free_list_t endpoints ;
136
-
137
- /** post descriptors pending resources */
138
- opal_list_t pending_post ;
138
+ opal_free_list_t rdma_descs ;
139
139
};
140
140
typedef struct mca_btl_ugni_device_t mca_btl_ugni_device_t ;
141
141
@@ -162,8 +162,6 @@ typedef struct mca_btl_ugni_module_t {
162
162
opal_mutex_t eager_get_pending_lock ;
163
163
opal_list_t eager_get_pending ;
164
164
165
- opal_free_list_t post_descriptors ;
166
-
167
165
mca_mpool_base_module_t * mpool ;
168
166
opal_free_list_t smsg_mboxes ;
169
167
@@ -196,9 +194,7 @@ typedef struct mca_btl_ugni_module_t {
196
194
* this rank should be limited too */
197
195
int nlocal_procs ;
198
196
199
- volatile int active_send_count ;
200
- volatile int64_t connected_peer_count ;
201
- volatile int64_t active_rdma_count ;
197
+ volatile int32_t active_rdma_count ;
202
198
203
199
mca_rcache_base_module_t * rcache ;
204
200
} mca_btl_ugni_module_t ;
@@ -212,6 +208,10 @@ typedef struct mca_btl_ugni_component_t {
212
208
/* Maximum number of entries a completion queue can hold */
213
209
uint32_t remote_cq_size ;
214
210
uint32_t local_cq_size ;
211
+ uint32_t local_rdma_cq_size ;
212
+ /* There is a hardware limitation that hurts BTE performance
213
+ * if we submit too many BTE requests. This acts as a throttle. */
214
+ int32_t active_rdma_threshold ;
215
215
216
216
/* number of ugni modules */
217
217
uint32_t ugni_num_btls ;
@@ -221,7 +221,16 @@ typedef struct mca_btl_ugni_component_t {
221
221
size_t smsg_max_data ;
222
222
223
223
/* After this message size switch to BTE protocols */
224
- size_t ugni_fma_limit ;
224
+ long int ugni_fma_limit ;
225
+ /** FMA switchover for get */
226
+ long int ugni_fma_get_limit ;
227
+ /** FMA switchover for put */
228
+ long int ugni_fma_put_limit ;
229
+
230
+ #if OPAL_C_HAVE__THREAD_LOCAL
231
+ bool bind_threads_to_devices ;
232
+ #endif
233
+
225
234
/* Switch to get when sending above this size */
226
235
size_t ugni_smsg_limit ;
227
236
@@ -282,25 +291,30 @@ typedef struct mca_btl_ugni_component_t {
282
291
283
292
/** NIC address */
284
293
uint32_t dev_addr ;
294
+
295
+ /** MCA variable identifier for the cdm_flags variable */
296
+ int cdm_flags_id ;
285
297
} mca_btl_ugni_component_t ;
286
298
287
299
/* Global structures */
288
300
289
301
OPAL_MODULE_DECLSPEC extern mca_btl_ugni_component_t mca_btl_ugni_component ;
290
302
OPAL_MODULE_DECLSPEC extern mca_btl_ugni_module_t mca_btl_ugni_module ;
291
303
292
- /**
293
- * Get a virtual device for communication
294
- */
295
- static inline mca_btl_ugni_device_t * mca_btl_ugni_ep_get_device (mca_btl_ugni_module_t * ugni_module )
304
+ static inline uint32_t mca_btl_ugni_ep_get_device_index (mca_btl_ugni_module_t * ugni_module )
296
305
{
297
306
static volatile uint32_t device_index = (uint32_t ) 0 ;
298
- uint32_t dev_index ;
299
307
300
308
/* don't really care if the device index is atomically updated */
301
- dev_index = (device_index ++ ) & (mca_btl_ugni_component .virtual_device_count - 1 );
309
+ return opal_atomic_fetch_add_32 (& device_index , 1 ) % mca_btl_ugni_component .virtual_device_count ;
310
+ }
302
311
303
- return ugni_module -> devices + dev_index ;
312
+ /**
313
+ * Get a virtual device for communication
314
+ */
315
+ static inline mca_btl_ugni_device_t * mca_btl_ugni_ep_get_device (mca_btl_ugni_module_t * ugni_module )
316
+ {
317
+ return ugni_module -> devices + mca_btl_ugni_ep_get_device_index (ugni_module );
304
318
}
305
319
306
320
static inline int mca_btl_rc_ugni_to_opal (gni_return_t rc )
@@ -322,6 +336,9 @@ static inline int mca_btl_rc_ugni_to_opal (gni_return_t rc)
322
336
return codes [rc ];
323
337
}
324
338
339
+
340
+ int mca_btl_ugni_flush (mca_btl_base_module_t * btl , struct mca_btl_base_endpoint_t * endpoint );
341
+
325
342
/**
326
343
* BML->BTL notification of change in the process list.
327
344
*
@@ -481,6 +498,16 @@ static inline uint64_t mca_btl_ugni_proc_name_to_id (opal_process_name_t name) {
481
498
int mca_btl_ugni_spawn_progress_thread (struct mca_btl_base_module_t * btl );
482
499
int mca_btl_ugni_kill_progress_thread (void );
483
500
501
+ struct mca_btl_ugni_post_descriptor_t ;
502
+
503
+ void btl_ugni_dump_post_desc (struct mca_btl_ugni_post_descriptor_t * desc );
504
+
505
+
506
+ struct mca_btl_ugni_post_descriptor_t ;
507
+
508
+ void mca_btl_ugni_handle_rdma_completions (mca_btl_ugni_module_t * ugni_module , mca_btl_ugni_device_t * device ,
509
+ struct mca_btl_ugni_post_descriptor_t * post_desc , const int count );
510
+
484
511
/**
485
512
* Try to lock a uGNI device for exclusive access
486
513
*/
@@ -531,6 +558,58 @@ static inline intptr_t mca_btl_ugni_device_serialize (mca_btl_ugni_device_t *dev
531
558
return rc ;
532
559
}
533
560
561
+ static inline intptr_t mca_btl_ugni_device_serialize_any (mca_btl_ugni_module_t * ugni_module ,
562
+ mca_btl_ugni_device_serialize_fn_t fn , void * arg )
563
+ {
564
+ mca_btl_ugni_device_t * device ;
565
+ intptr_t rc ;
566
+
567
+ if (!opal_using_threads ()) {
568
+ return fn (ugni_module -> devices , arg );
569
+ }
570
+
571
+ #if OPAL_C_HAVE__THREAD_LOCAL
572
+ if (mca_btl_ugni_component .bind_threads_to_devices ) {
573
+ /* NTH: if we have C11 _Thread_local just go ahead and assign the devices round-robin to each
574
+ * thread. in testing this give much better performance than just picking any device */
575
+ static _Thread_local mca_btl_ugni_device_t * device_local = NULL ;
576
+
577
+ device = device_local ;
578
+ if (OPAL_UNLIKELY (NULL == device )) {
579
+ /* assign device contexts round-robin */
580
+ device_local = device = mca_btl_ugni_ep_get_device (ugni_module );
581
+ }
582
+
583
+ mca_btl_ugni_device_lock (device );
584
+ } else {
585
+ #endif
586
+ /* get the next starting index */
587
+ uint32_t device_index = mca_btl_ugni_ep_get_device_index (ugni_module );
588
+ const int device_count = mca_btl_ugni_component .virtual_device_count ;
589
+
590
+ for (int i = 0 ; i < device_count ; ++ i ) {
591
+ device = ugni_module -> devices + ((device_index + i ) % device_count );
592
+ if (!mca_btl_ugni_device_trylock (device )) {
593
+ break ;
594
+ }
595
+
596
+ device = NULL ;
597
+ }
598
+
599
+ if (NULL == device ) {
600
+ device = mca_btl_ugni_ep_get_device (ugni_module );
601
+ mca_btl_ugni_device_lock (device );
602
+ }
603
+ #if OPAL_C_HAVE__THREAD_LOCAL
604
+ }
605
+ #endif
606
+
607
+ rc = fn (device , arg );
608
+ mca_btl_ugni_device_unlock (device );
609
+
610
+ return rc ;
611
+ }
612
+
534
613
535
614
/** Number of times the progress thread has woken up */
536
615
extern unsigned int mca_btl_ugni_progress_thread_wakeups ;
0 commit comments