Skip to content

Commit 16b49dc

Browse files
committed
A complete overhaul of the HAN code.
Among many other things: - Fix an imbalance bug in MPI_allgather - Accept more human readable configuration files. We can now specify the collective by name instead of a magic number, and the component we want to use also by name. - Add the capability to have optional arguments in the collective communication configuration file. Right now the capability exists for segment lengths, but is yet to be connected with the algorithms. - Redo the initialization of all HAN collectives. Cleanup the fallback collective support. - In case the module is unable to deliver the expected result, it will fall back to executing the collective operation on another collective component. This change makes the support for this fallback simpler to use. - Implement a fallback allowing a HAN module to remove itself as potential active collective module, and instead fall back to the next module in line. - Completely disable the HAN modules on error. From the moment an error is encountered they remove themselves from the communicator, and in case some other module calls them they simply behave as a pass-through. Communicator: provide ompi_comm_split_with_info to split and provide info at the same time. Add ompi_comm_coll_preference info key to control collective component selection. COLL HAN: use info keys instead of component-level variable to communicate topology level between abstraction layers - The info value is a comma-separated list of entries, which are chosen with decreasing priorities. This overrides the priority of the component, unless the component has disqualified itself. An entry prefixed with ^ starts the ignore-list. Any entry following this character will be ignored during the collective component selection for the communicator. Example: "sm,libnbc,^han,adapt" gives sm the highest preference, followed by libnbc. The components han and adapt are ignored in the selection process. 
- Allocate a temporary buffer for all lower-level leaders (length 2 segments) - Fix the handling of MPI_IN_PLACE for gather and scatter. COLL HAN: Fix topology handling - HAN should not rely on node names to determine the ordering of ranks. Instead, use the node leaders as identifiers and short-cut if the node-leaders agree that ranks are consecutive. Also, error out if the rank distribution is imbalanced for now. Signed-off-by: Xi Luo <xluo12@vols.utk.edu> Signed-off-by: Joseph Schuchart <schuchart@icl.utk.edu> Signed-off-by: George Bosilca <bosilca@icl.utk.edu>
1 parent 220b997 commit 16b49dc

31 files changed

+2631
-2937
lines changed

ompi/communicator/comm.c

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -401,11 +401,10 @@ int ompi_comm_create ( ompi_communicator_t *comm, ompi_group_t *group,
401401
/**********************************************************************/
402402
/**********************************************************************/
403403
/**********************************************************************/
404-
/*
405-
** Counterpart to MPI_Comm_split. To be used within OMPI (e.g. MPI_Cart_sub).
406-
*/
407-
int ompi_comm_split( ompi_communicator_t* comm, int color, int key,
408-
ompi_communicator_t **newcomm, bool pass_on_topo )
404+
405+
int ompi_comm_split_with_info( ompi_communicator_t* comm, int color, int key,
406+
opal_info_t *info,
407+
ompi_communicator_t **newcomm, bool pass_on_topo )
409408
{
410409
int myinfo[2];
411410
int size, my_size;
@@ -611,7 +610,11 @@ int ompi_comm_split( ompi_communicator_t* comm, int color, int key,
611610
snprintf(newcomp->c_name, MPI_MAX_OBJECT_NAME, "MPI COMMUNICATOR %d SPLIT FROM %d",
612611
newcomp->c_contextid, comm->c_contextid );
613612

614-
613+
/* Copy info if there is one */
614+
if (info) {
615+
newcomp->super.s_info = OBJ_NEW(opal_info_t);
616+
opal_info_dup(info, &(newcomp->super.s_info));
617+
}
615618

616619
/* Activate the communicator and init coll-component */
617620
rc = ompi_comm_activate (&newcomp, comm, NULL, NULL, NULL, false, mode);
@@ -638,6 +641,15 @@ int ompi_comm_split( ompi_communicator_t* comm, int color, int key,
638641
}
639642

640643

644+
/*
645+
** Counterpart to MPI_Comm_split. To be used within OMPI (e.g. MPI_Cart_sub).
**
** Thin wrapper: every argument is forwarded unchanged to
** ompi_comm_split_with_info() with a NULL info object, and that call's
** return value is returned as-is.
646+
*/
647+
int ompi_comm_split( ompi_communicator_t* comm, int color, int key,
648+
ompi_communicator_t **newcomm, bool pass_on_topo )
649+
{
650+
/* NULL info: this entry point has no info object to hand over */
return ompi_comm_split_with_info(comm, color, key, NULL, newcomm, pass_on_topo);
651+
}
652+
641653
/**********************************************************************/
642654
/**********************************************************************/
643655
/**********************************************************************/

ompi/communicator/communicator.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -463,6 +463,21 @@ int ompi_topo_dist_graph_create_adjacent(ompi_communicator_t *old_comm,
463463
OMPI_DECLSPEC int ompi_comm_split (ompi_communicator_t *comm, int color, int key,
464464
ompi_communicator_t** newcomm, bool pass_on_topo);
465465

466+
/**
467+
* split a communicator based on color and key. Parameters
468+
* are identical to the MPI-counterpart of the function.
469+
* Similar to \see ompi_comm_split with an additional info parameter.
470+
*
471+
* @param comm: input communicator
472+
* @param color
473+
* @param key
474+
*
475+
* @param info: optional info object; may be NULL (ompi_comm_split
*              passes NULL here)
* @param newcomm: presumably receives the communicator produced by the
*                 split -- TODO confirm against the implementation
* @param pass_on_topo: NOTE(review): semantics are not documented at this
*                      declaration; confirm against ompi_comm_split
476+
*/
477+
OMPI_DECLSPEC int ompi_comm_split_with_info( ompi_communicator_t* comm, int color, int key,
478+
opal_info_t *info,
479+
ompi_communicator_t **newcomm, bool pass_on_topo );
480+
466481
/**
467482
* split a communicator based on type and key. Parameters
468483
* are identical to the MPI-counterpart of the function.

ompi/group/group.c

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -578,3 +578,31 @@ bool ompi_group_have_remote_peers (ompi_group_t *group)
578578

579579
return false;
580580
}
581+
582+
/**
583+
* Count the number of processes on this group that share the same node as
584+
* this process.
*
* Walks all grp_proc_count entries of the group and counts those whose
* proc_flags satisfy OPAL_PROC_ON_LOCAL_NODE.
*
* @param group  group to inspect; dereferenced without a NULL check
* @return number of entries flagged as residing on the local node
*         (NOTE(review): presumably includes the calling process itself
*         when it is a member of the group -- confirm)
585+
*/
586+
int ompi_group_count_local_peers (ompi_group_t *group)
587+
{
588+
int local_peers = 0;
589+
for (int i = 0 ; i < group->grp_proc_count ; ++i) {
590+
ompi_proc_t *proc = NULL;
591+
#if OMPI_GROUP_SPARSE
592+
/* sparse-group build: resolve the entry through the regular peer lookup */
proc = ompi_group_peer_lookup (group, i);
593+
#else
594+
/* non-sparse build: read the stored pointer as-is; it may be a sentinel */
proc = ompi_group_get_proc_ptr_raw (group, i);
595+
if (ompi_proc_is_sentinel (proc)) {
596+
/* the proc must be stored in the group or cached in the proc
597+
* hash table if the process resides in the local node
598+
* (see ompi_proc_complete_init) */
/* hence a sentinel entry is treated as non-local and not counted */
599+
continue;
600+
}
601+
#endif
602+
if (OPAL_PROC_ON_LOCAL_NODE(proc->super.proc_flags)) {
603+
local_peers++;
604+
}
605+
}
606+
607+
return local_peers;
608+
}

ompi/group/group.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -420,8 +420,16 @@ static inline struct ompi_proc_t *ompi_group_peer_lookup_existing (ompi_group_t
420420
return ompi_group_get_proc_ptr (group, peer_id, false);
421421
}
422422

423+
/**
424+
* Return true if all processes in the group are not on the local node.
425+
*/
423426
bool ompi_group_have_remote_peers (ompi_group_t *group);
424427

428+
/**
429+
* Count the number of processes on the local node.
430+
*/
431+
int ompi_group_count_local_peers (ompi_group_t *group);
432+
425433
/**
426434
* Function to print the group info
427435
*/

ompi/mca/coll/adapt/coll_adapt_ibcast.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,7 @@ static int send_cb(ompi_request_t * req)
178178
|| (context->con->tree->tree_nextsize > 0 && rank != context->con->root
179179
&& num_sent == context->con->tree->tree_nextsize * context->con->num_segs
180180
&& num_recv_fini == context->con->num_segs)) {
181-
OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: Singal in send\n",
181+
OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: Signal in send\n",
182182
ompi_comm_rank(context->con->comm)));
183183
ibcast_request_fini(context);
184184
}
@@ -306,7 +306,7 @@ static int recv_cb(ompi_request_t * req)
306306
&& num_recv_fini == context->con->num_segs)
307307
|| (context->con->tree->tree_nextsize == 0
308308
&& num_recv_fini == context->con->num_segs)) {
309-
OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: Singal in recv\n",
309+
OPAL_OUTPUT_VERBOSE((30, mca_coll_adapt_component.adapt_output, "[%d]: Signal in recv\n",
310310
ompi_comm_rank(context->con->comm)));
311311
ibcast_request_fini(context);
312312
}

ompi/mca/coll/base/coll_base_comm_select.c

Lines changed: 98 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
#include "mpi.h"
3939
#include "ompi/communicator/communicator.h"
4040
#include "opal/util/output.h"
41+
#include "opal/util/argv.h"
4142
#include "opal/util/show_help.h"
4243
#include "opal/class/opal_list.h"
4344
#include "opal/class/opal_object.h"
@@ -312,6 +313,20 @@ static int avail_coll_compare (opal_list_item_t **a,
312313
return 0;
313314
}
314315

316+
/*
 * Tell whether component_name appears in a NULL-terminated string array.
 *
 * argv may be NULL, in which case nothing matches. Entries are compared
 * with strcmp(), so the match is exact and case-sensitive.
 *
 * Returns 1 if an entry equals component_name, 0 otherwise.
 */
static inline int
317+
component_in_argv(char **argv, const char* component_name)
318+
{
319+
if( NULL != argv ) {
320+
while( NULL != *argv ) {
321+
if( 0 == strcmp(component_name, *argv) ) {
322+
return 1;
323+
}
324+
argv++; /* move to the next argument */
325+
}
326+
}
327+
/* NULL array, empty array, or no entry matched */
return 0;
328+
}
329+
315330
/*
316331
* For each module in the list, check and see if it wants to run, and
317332
* do the resulting priority comparison. Make a list of modules to be
@@ -321,20 +336,80 @@ static int avail_coll_compare (opal_list_item_t **a,
321336
static opal_list_t *check_components(opal_list_t * components,
322337
ompi_communicator_t * comm)
323338
{
324-
int priority;
339+
int priority, flag;
325340
const mca_base_component_t *component;
326341
mca_base_component_list_item_t *cli;
327342
mca_coll_base_module_2_3_0_t *module;
328343
opal_list_t *selectable;
329344
mca_coll_base_avail_coll_t *avail;
330-
345+
char info_val[OPAL_MAX_INFO_VAL+1];
346+
char **coll_argv = NULL, **coll_exclude = NULL, **coll_include = NULL;
347+
348+
/* Check if this communicator comes with restrictions on the collective modules
349+
* it wants to use. The restrictions are consistent with the MCA parameter
350+
* to limit the collective components loaded, but it applies for each
351+
* communicator and is provided as an info key during the communicator
352+
* creation. Unlike the MCA param, this info key is used not to select
353+
* components but either to prevent components from being used or to
354+
* force a change in the component priority.
355+
*/
356+
if( NULL != comm->super.s_info) {
357+
opal_info_get(comm->super.s_info, "ompi_comm_coll_preference",
358+
sizeof(info_val), info_val, &flag);
359+
if( !flag ) {
360+
goto proceed_to_select;
361+
}
362+
coll_argv = opal_argv_split(info_val, ',');
363+
if(NULL == coll_argv) {
364+
goto proceed_to_select;
365+
}
366+
int idx2, count_include = opal_argv_count(coll_argv);
367+
/* Allocate the coll_include argv */
368+
coll_include = (char**)malloc((count_include + 1) * sizeof(char*));
369+
coll_include[count_include] = NULL; /* NULL terminated array */
370+
/* Dispatch the include/exclude in the corresponding arrays */
371+
for( int idx = 0; NULL != coll_argv[idx]; idx++ ) {
372+
if( '^' == coll_argv[idx][0] ) {
373+
coll_include[idx] = NULL; /* NULL terminated array */
374+
375+
/* Allocate the coll_exclude argv */
376+
coll_exclude = (char**)malloc((count_include - idx + 1) * sizeof(char*));
377+
/* save the exclude components */
378+
for( idx2 = idx; NULL != coll_argv[idx2]; idx2++ ) {
379+
coll_exclude[idx2 - idx] = coll_argv[idx2];
380+
}
381+
coll_exclude[idx2 - idx] = NULL; /* NULL-terminated array */
382+
coll_exclude[0] = coll_exclude[0] + 1; /* get rid of the ^ */
383+
count_include = idx;
384+
break;
385+
}
386+
coll_include[idx] = coll_argv[idx];
387+
}
388+
/* Reverse the order of the coll_include argv to facilitate the ordering of
389+
* the selected components in reverse.
390+
*/
391+
for( idx2 = 0; idx2 < (count_include - 1); idx2++ ) {
392+
char* temp = coll_include[idx2];
393+
coll_include[idx2] = coll_include[count_include - 1];
394+
coll_include[count_include - 1] = temp;
395+
count_include--;
396+
}
397+
}
398+
proceed_to_select:
331399
/* Make a list of the components that query successfully */
332400
selectable = OBJ_NEW(opal_list_t);
333401

334402
/* Scan through the list of components */
335403
OPAL_LIST_FOREACH(cli, &ompi_coll_base_framework.framework_components, mca_base_component_list_item_t) {
336404
component = cli->cli_component;
337405

406+
/* don't bother if we have this component in the exclusion list */
407+
if( component_in_argv(coll_exclude, component->mca_component_name) ) {
408+
opal_output_verbose(10, ompi_coll_base_framework.framework_output,
409+
"coll:base:comm_select: component disqualified: %s (due to communicator info key)",
410+
component->mca_component_name );
411+
continue;
412+
}
338413
priority = check_one_component(comm, component, &module);
339414
if (priority >= 0) {
340415
/* We have a component that indicated that it wants to run
@@ -370,6 +445,27 @@ static opal_list_t *check_components(opal_list_t * components,
370445
/* Put this list in priority order */
371446
opal_list_sort(selectable, avail_coll_compare);
372447

448+
/* For all valid component reorder them not on their provided priorities but on
449+
* the order requested in the info key. As at this point the coll_include is
450+
* already ordered backward we can simply prepend the components.
451+
*/
452+
mca_coll_base_avail_coll_t *item, *item_next;
453+
OPAL_LIST_FOREACH_SAFE(item, item_next,
454+
selectable, mca_coll_base_avail_coll_t) {
455+
if( component_in_argv(coll_include, item->ac_component_name) ) {
456+
opal_list_remove_item(selectable, &item->super);
457+
opal_list_prepend(selectable, &item->super);
458+
}
459+
}
460+
461+
opal_argv_free(coll_argv);
462+
if( NULL != coll_exclude ) {
463+
free(coll_exclude);
464+
}
465+
if( NULL != coll_include ) {
466+
free(coll_include);
467+
}
468+
373469
/* All done */
374470
return selectable;
375471
}
@@ -403,7 +499,6 @@ static int check_one_component(ompi_communicator_t * comm,
403499
return priority;
404500
}
405501

406-
407502
/**************************************************************************
408503
* Query functions
409504
**************************************************************************/

0 commit comments

Comments
 (0)