Skip to content

Commit ce97090

Browse files
authored
Merge pull request #7735 from bosilca/coll/han
A hierarchical, architecture-aware collective communication module
2 parents 6304c3f + cc6432b commit ce97090

33 files changed

+6926
-119
lines changed

ompi/communicator/comm.c

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -401,11 +401,10 @@ int ompi_comm_create ( ompi_communicator_t *comm, ompi_group_t *group,
401401
/**********************************************************************/
402402
/**********************************************************************/
403403
/**********************************************************************/
404-
/*
405-
** Counterpart to MPI_Comm_split. To be used within OMPI (e.g. MPI_Cart_sub).
406-
*/
407-
int ompi_comm_split( ompi_communicator_t* comm, int color, int key,
408-
ompi_communicator_t **newcomm, bool pass_on_topo )
404+
405+
int ompi_comm_split_with_info( ompi_communicator_t* comm, int color, int key,
406+
opal_info_t *info,
407+
ompi_communicator_t **newcomm, bool pass_on_topo )
409408
{
410409
int myinfo[2];
411410
int size, my_size;
@@ -611,7 +610,11 @@ int ompi_comm_split( ompi_communicator_t* comm, int color, int key,
611610
snprintf(newcomp->c_name, MPI_MAX_OBJECT_NAME, "MPI COMMUNICATOR %d SPLIT FROM %d",
612611
newcomp->c_contextid, comm->c_contextid );
613612

614-
613+
/* Copy info if there is one */
614+
if (info) {
615+
newcomp->super.s_info = OBJ_NEW(opal_info_t);
616+
opal_info_dup(info, &(newcomp->super.s_info));
617+
}
615618

616619
/* Activate the communicator and init coll-component */
617620
rc = ompi_comm_activate (&newcomp, comm, NULL, NULL, NULL, false, mode);
@@ -638,6 +641,15 @@ int ompi_comm_split( ompi_communicator_t* comm, int color, int key,
638641
}
639642

640643

644+
/*
645+
** Counterpart to MPI_Comm_split. To be used within OMPI (e.g. MPI_Cart_sub).
646+
*/
647+
int ompi_comm_split( ompi_communicator_t* comm, int color, int key,
648+
ompi_communicator_t **newcomm, bool pass_on_topo )
649+
{
650+
return ompi_comm_split_with_info(comm, color, key, NULL, newcomm, pass_on_topo);
651+
}
652+
641653
/**********************************************************************/
642654
/**********************************************************************/
643655
/**********************************************************************/

ompi/communicator/communicator.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -463,6 +463,21 @@ int ompi_topo_dist_graph_create_adjacent(ompi_communicator_t *old_comm,
463463
OMPI_DECLSPEC int ompi_comm_split (ompi_communicator_t *comm, int color, int key,
464464
ompi_communicator_t** newcomm, bool pass_on_topo);
465465

466+
/**
467+
* split a communicator based on color and key. Parameters
468+
* are identical to the MPI-counterpart of the function.
469+
* Similar to \see ompi_comm_split with an additional info parameter.
470+
*
471+
* @param comm: input communicator
472+
* @param color
473+
* @param key
474+
*
475+
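* @param info: optional info object (may be NULL); when provided it is
*              duplicated onto the new communicator
* @param newcomm: the resulting communicator
* @param pass_on_topo: same meaning as in ompi_comm_split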
476+
*/
477+
OMPI_DECLSPEC int ompi_comm_split_with_info( ompi_communicator_t* comm, int color, int key,
478+
opal_info_t *info,
479+
ompi_communicator_t **newcomm, bool pass_on_topo );
480+
466481
/**
467482
* split a communicator based on type and key. Parameters
468483
* are identical to the MPI-counterpart of the function.

ompi/group/group.c

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -578,3 +578,31 @@ bool ompi_group_have_remote_peers (ompi_group_t *group)
578578

579579
return false;
580580
}
581+
582+
/**
583+
* Count the number of processes on this group that share the same node as
584+
* this process.
585+
*/
586+
int ompi_group_count_local_peers (ompi_group_t *group)
587+
{
588+
int local_peers = 0;
589+
for (int i = 0 ; i < group->grp_proc_count ; ++i) {
590+
ompi_proc_t *proc = NULL;
591+
#if OMPI_GROUP_SPARSE
592+
proc = ompi_group_peer_lookup (group, i);
593+
#else
594+
proc = ompi_group_get_proc_ptr_raw (group, i);
595+
if (ompi_proc_is_sentinel (proc)) {
596+
/* the proc must be stored in the group or cached in the proc
597+
* hash table if the process resides in the local node
598+
* (see ompi_proc_complete_init) */
599+
continue;
600+
}
601+
#endif
602+
if (OPAL_PROC_ON_LOCAL_NODE(proc->super.proc_flags)) {
603+
local_peers++;
604+
}
605+
}
606+
607+
return local_peers;
608+
}

ompi/group/group.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -420,8 +420,16 @@ static inline struct ompi_proc_t *ompi_group_peer_lookup_existing (ompi_group_t
420420
return ompi_group_get_proc_ptr (group, peer_id, false);
421421
}
422422

423+
/**
424+
* Return true if at least one process in the group is not on the local node.
425+
*/
423426
bool ompi_group_have_remote_peers (ompi_group_t *group);
424427

428+
/**
429+
* Count the number of processes on the local node.
430+
*/
431+
int ompi_group_count_local_peers (ompi_group_t *group);
432+
425433
/**
426434
* Function to print the group info
427435
*/

ompi/mca/coll/adapt/coll_adapt_ibcast.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,7 @@ static int send_cb(ompi_request_t * req)
178178
|| (context->con->tree->tree_nextsize > 0 && rank != context->con->root
179179
&& num_sent == context->con->tree->tree_nextsize * context->con->num_segs
180180
&& num_recv_fini == context->con->num_segs)) {
181-
OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: Singal in send\n",
181+
OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: Signal in send\n",
182182
ompi_comm_rank(context->con->comm)));
183183
ibcast_request_fini(context);
184184
}
@@ -306,7 +306,7 @@ static int recv_cb(ompi_request_t * req)
306306
&& num_recv_fini == context->con->num_segs)
307307
|| (context->con->tree->tree_nextsize == 0
308308
&& num_recv_fini == context->con->num_segs)) {
309-
OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: Singal in recv\n",
309+
OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: Signal in recv\n",
310310
ompi_comm_rank(context->con->comm)));
311311
ibcast_request_fini(context);
312312
}

ompi/mca/coll/base/coll_base_comm_select.c

Lines changed: 117 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
* and Technology (RIST). All rights reserved.
2222
* Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
2323
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
24+
* Copyright (c) 2020 BULL S.A.S. All rights reserved.
2425
* $COPYRIGHT$
2526
*
2627
* Additional copyrights may follow
@@ -37,27 +38,20 @@
3738
#include "mpi.h"
3839
#include "ompi/communicator/communicator.h"
3940
#include "opal/util/output.h"
41+
#include "opal/util/argv.h"
4042
#include "opal/util/show_help.h"
4143
#include "opal/class/opal_list.h"
4244
#include "opal/class/opal_object.h"
4345
#include "ompi/mca/mca.h"
4446
#include "opal/mca/base/base.h"
4547
#include "ompi/mca/coll/coll.h"
4648
#include "ompi/mca/coll/base/base.h"
47-
49+
#include "ompi/mca/coll/base/coll_base_util.h"
4850

4951
/*
50-
* Local types
52+
* Stuff for the OBJ interface
5153
*/
52-
struct avail_coll_t {
53-
opal_list_item_t super;
54-
55-
int ac_priority;
56-
mca_coll_base_module_2_3_0_t *ac_module;
57-
const char * ac_component_name;
58-
};
59-
typedef struct avail_coll_t avail_coll_t;
60-
54+
OBJ_CLASS_INSTANCE(mca_coll_base_avail_coll_t, opal_list_item_t, NULL, NULL);
6155

6256
/*
6357
* Local functions
@@ -77,12 +71,6 @@ static int query_2_0_0(const mca_coll_base_component_2_0_0_t *
7771
int *priority,
7872
mca_coll_base_module_2_3_0_t ** module);
7973

80-
/*
81-
* Stuff for the OBJ interface
82-
*/
83-
static OBJ_CLASS_INSTANCE(avail_coll_t, opal_list_item_t, NULL, NULL);
84-
85-
8674
#define COPY(module, comm, func) \
8775
do { \
8876
if (NULL != module->coll_ ## func) { \
@@ -138,11 +126,14 @@ int mca_coll_base_comm_select(ompi_communicator_t * comm)
138126
/* FIX ME - Do some kind of collective operation to find a module
139127
that everyone has available */
140128

129+
/* List to store every valid module */
130+
comm->c_coll->module_list = OBJ_NEW(opal_list_t);
131+
141132
/* do the selection loop */
142133
for (item = opal_list_remove_first(selectable);
143134
NULL != item; item = opal_list_remove_first(selectable)) {
144135

145-
avail_coll_t *avail = (avail_coll_t *) item;
136+
mca_coll_base_avail_coll_t *avail = (mca_coll_base_avail_coll_t *) item;
146137

147138
/* initialize the module */
148139
ret = avail->ac_module->coll_module_enable(avail->ac_module, comm);
@@ -153,6 +144,9 @@ int mca_coll_base_comm_select(ompi_communicator_t * comm)
153144
(OMPI_SUCCESS == ret ? "Enabled": "Disabled") );
154145

155146
if (OMPI_SUCCESS == ret) {
147+
/* Save every component that is initialized,
148+
* queried and enabled successfully */
149+
opal_list_append(comm->c_coll->module_list, &avail->super);
156150

157151
/* copy over any of the pointers */
158152
COPY(avail->ac_module, comm, allgather);
@@ -230,10 +224,11 @@ int mca_coll_base_comm_select(ompi_communicator_t * comm)
230224
COPY(avail->ac_module, comm, neighbor_alltoallw_init);
231225

232226
COPY(avail->ac_module, comm, reduce_local);
227+
} else {
228+
/* release the original module reference and the list item */
229+
OBJ_RELEASE(avail->ac_module);
230+
OBJ_RELEASE(avail);
233231
}
234-
/* release the original module reference and the list item */
235-
OBJ_RELEASE(avail->ac_module);
236-
OBJ_RELEASE(avail);
237232
}
238233

239234
/* Done with the list from the check_components() call so release it. */
@@ -306,8 +301,8 @@ int mca_coll_base_comm_select(ompi_communicator_t * comm)
306301

307302
static int avail_coll_compare (opal_list_item_t **a,
308303
opal_list_item_t **b) {
309-
avail_coll_t *acoll = (avail_coll_t *) *a;
310-
avail_coll_t *bcoll = (avail_coll_t *) *b;
304+
mca_coll_base_avail_coll_t *acoll = (mca_coll_base_avail_coll_t *) *a;
305+
mca_coll_base_avail_coll_t *bcoll = (mca_coll_base_avail_coll_t *) *b;
311306

312307
if (acoll->ac_priority > bcoll->ac_priority) {
313308
return 1;
@@ -318,6 +313,20 @@ static int avail_coll_compare (opal_list_item_t **a,
318313
return 0;
319314
}
320315

316+
/**
 * Check whether a component name appears in a NULL-terminated argv array.
 *
 * @param argv            NULL-terminated array of strings; may be NULL, in
 *                        which case nothing matches
 * @param component_name  name to look for (exact, case-sensitive comparison)
 *
 * @return 1 if component_name is equal to one of the argv entries,
 *         0 otherwise (including when argv or component_name is NULL)
 */
static inline int
component_in_argv(char **argv, const char* component_name)
{
    /* strcmp() on a NULL pointer is undefined behavior, so a missing list
     * or a missing name is simply reported as "not found" */
    if( NULL == argv || NULL == component_name ) {
        return 0;
    }
    for( ; NULL != *argv; argv++ ) {
        if( 0 == strcmp(component_name, *argv) ) {
            return 1;
        }
    }
    return 0;
}
329+
321330
/*
322331
* For each module in the list, check and see if it wants to run, and
323332
* do the resulting priority comparison. Make a list of modules to be
@@ -327,25 +336,85 @@ static int avail_coll_compare (opal_list_item_t **a,
327336
static opal_list_t *check_components(opal_list_t * components,
328337
ompi_communicator_t * comm)
329338
{
330-
int priority;
339+
int priority, flag;
331340
const mca_base_component_t *component;
332341
mca_base_component_list_item_t *cli;
333342
mca_coll_base_module_2_3_0_t *module;
334343
opal_list_t *selectable;
335-
avail_coll_t *avail;
336-
344+
mca_coll_base_avail_coll_t *avail;
345+
char info_val[OPAL_MAX_INFO_VAL+1];
346+
char **coll_argv = NULL, **coll_exclude = NULL, **coll_include = NULL;
347+
348+
/* Check if this communicator comes with restrictions on the collective modules
349+
* it wants to use. The restrictions are consistent with the MCA parameter
350+
* to limit the collective components loaded, but it applies for each
351+
* communicator and is provided as an info key during the communicator
352+
* creation. Unlike the MCA param, this info key is used not to select
353+
* components but either to prevent components from being used or to
354+
* force a change in the component priority.
355+
*/
356+
if( NULL != comm->super.s_info) {
357+
opal_info_get(comm->super.s_info, "ompi_comm_coll_preference",
358+
sizeof(info_val), info_val, &flag);
359+
if( !flag ) {
360+
goto proceed_to_select;
361+
}
362+
coll_argv = opal_argv_split(info_val, ',');
363+
if(NULL == coll_argv) {
364+
goto proceed_to_select;
365+
}
366+
int idx2, count_include = opal_argv_count(coll_argv);
367+
/* Allocate the coll_include argv */
368+
coll_include = (char**)malloc((count_include + 1) * sizeof(char*));
369+
coll_include[count_include] = NULL; /* NULL terminated array */
370+
/* Dispatch the include/exclude in the corresponding arrays */
371+
for( int idx = 0; NULL != coll_argv[idx]; idx++ ) {
372+
if( '^' == coll_argv[idx][0] ) {
373+
coll_include[idx] = NULL; /* NULL terminated array */
374+
375+
/* Allocate the coll_exclude argv */
376+
coll_exclude = (char**)malloc((count_include - idx + 1) * sizeof(char*));
377+
/* save the exclude components */
378+
for( idx2 = idx; NULL != coll_argv[idx2]; idx2++ ) {
379+
coll_exclude[idx2 - idx] = coll_argv[idx2];
380+
}
381+
coll_exclude[idx2 - idx] = NULL; /* NULL-terminated array */
382+
coll_exclude[0] = coll_exclude[0] + 1; /* get rid of the ^ */
383+
count_include = idx;
384+
break;
385+
}
386+
coll_include[idx] = coll_argv[idx];
387+
}
388+
/* Reverse the order of the coll_include argv to facilitate ordering
389+
* the selected components in reverse.
390+
*/
391+
for( idx2 = 0; idx2 < (count_include - 1); idx2++ ) {
392+
char* temp = coll_include[idx2];
393+
coll_include[idx2] = coll_include[count_include - 1];
394+
coll_include[count_include - 1] = temp;
395+
count_include--;
396+
}
397+
}
398+
proceed_to_select:
337399
/* Make a list of the components that query successfully */
338400
selectable = OBJ_NEW(opal_list_t);
339401

340402
/* Scan through the list of components */
341403
OPAL_LIST_FOREACH(cli, &ompi_coll_base_framework.framework_components, mca_base_component_list_item_t) {
342404
component = cli->cli_component;
343405

406+
/* don't bother if we have this component in the exclusion list */
407+
if( component_in_argv(coll_exclude, component->mca_component_name) ) {
408+
opal_output_verbose(10, ompi_coll_base_framework.framework_output,
409+
"coll:base:comm_select: component disqualified: %s (due to communicator info key)",
410+
component->mca_component_name );
411+
continue;
412+
}
344413
priority = check_one_component(comm, component, &module);
345414
if (priority >= 0) {
346415
/* We have a component that indicated that it wants to run
347416
by giving us a module */
348-
avail = OBJ_NEW(avail_coll_t);
417+
avail = OBJ_NEW(mca_coll_base_avail_coll_t);
349418
avail->ac_priority = priority;
350419
avail->ac_module = module;
351420
// Point to the string so we don't have to free later
@@ -376,6 +445,27 @@ static opal_list_t *check_components(opal_list_t * components,
376445
/* Put this list in priority order */
377446
opal_list_sort(selectable, avail_coll_compare);
378447

448+
/* Reorder all valid components based not on their provided priorities but on
449+
* the order requested in the info key. As at this point the coll_include is
450+
* already ordered backward we can simply prepend the components.
451+
*/
452+
mca_coll_base_avail_coll_t *item, *item_next;
453+
OPAL_LIST_FOREACH_SAFE(item, item_next,
454+
selectable, mca_coll_base_avail_coll_t) {
455+
if( component_in_argv(coll_include, item->ac_component_name) ) {
456+
opal_list_remove_item(selectable, &item->super);
457+
opal_list_prepend(selectable, &item->super);
458+
}
459+
}
460+
461+
opal_argv_free(coll_argv);
462+
if( NULL != coll_exclude ) {
463+
free(coll_exclude);
464+
}
465+
if( NULL != coll_include ) {
466+
free(coll_include);
467+
}
468+
379469
/* All done */
380470
return selectable;
381471
}
@@ -409,7 +499,6 @@ static int check_one_component(ompi_communicator_t * comm,
409499
return priority;
410500
}
411501

412-
413502
/**************************************************************************
414503
* Query functions
415504
**************************************************************************/

0 commit comments

Comments
 (0)