Skip to content

Commit 220b997

Browse files
bsergentmpierreleFlorentGermain-Bull
authored and committed
Coll/han Bull
* first import of Bull specific modifications to HAN * Cleaning, renaming and compilation fixing Changed all future into han. * Import BULL specific modifications in coll/tuned and coll/base * Fixed compilation issues in Han * Changed han_output to directly point to coll framework output. * The verbosity MCA parameter was removed as a duplicate of coll verbosity * Add fallback in han reduce when op cannot commute and ppn are imbalanced * Added fallback for han bcast when nodes do not have the same number of processes * Add fallback in han scatter when ppn are imbalanced + fixed missing scatter_fn pointer in the module interface Signed-off-by: Brelle Emmanuel <emmanuel.brelle@atos.net> Co-authored-by: a700850 <pierre.lemarinier@atos.net> Co-authored-by: germainf <florent.germain@atos.net>
1 parent 182c333 commit 220b997

22 files changed

+4771
-505
lines changed

ompi/mca/coll/base/coll_base_comm_select.c

Lines changed: 19 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
* and Technology (RIST). All rights reserved.
2222
* Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
2323
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
24+
* Copyright (c) 2020 BULL S.A.S. All rights reserved.
2425
* $COPYRIGHT$
2526
*
2627
* Additional copyrights may follow
@@ -44,20 +45,12 @@
4445
#include "opal/mca/base/base.h"
4546
#include "ompi/mca/coll/coll.h"
4647
#include "ompi/mca/coll/base/base.h"
47-
48+
#include "ompi/mca/coll/base/coll_base_util.h"
4849

4950
/*
50-
* Local types
51+
* Stuff for the OBJ interface
5152
*/
52-
struct avail_coll_t {
53-
opal_list_item_t super;
54-
55-
int ac_priority;
56-
mca_coll_base_module_2_3_0_t *ac_module;
57-
const char * ac_component_name;
58-
};
59-
typedef struct avail_coll_t avail_coll_t;
60-
53+
OBJ_CLASS_INSTANCE(mca_coll_base_avail_coll_t, opal_list_item_t, NULL, NULL);
6154

6255
/*
6356
* Local functions
@@ -77,12 +70,6 @@ static int query_2_0_0(const mca_coll_base_component_2_0_0_t *
7770
int *priority,
7871
mca_coll_base_module_2_3_0_t ** module);
7972

80-
/*
81-
* Stuff for the OBJ interface
82-
*/
83-
static OBJ_CLASS_INSTANCE(avail_coll_t, opal_list_item_t, NULL, NULL);
84-
85-
8673
#define COPY(module, comm, func) \
8774
do { \
8875
if (NULL != module->coll_ ## func) { \
@@ -138,11 +125,14 @@ int mca_coll_base_comm_select(ompi_communicator_t * comm)
138125
/* FIX ME - Do some kind of collective operation to find a module
139126
that everyone has available */
140127

128+
/* List to store every valid module */
129+
comm->c_coll->module_list = OBJ_NEW(opal_list_t);
130+
141131
/* do the selection loop */
142132
for (item = opal_list_remove_first(selectable);
143133
NULL != item; item = opal_list_remove_first(selectable)) {
144134

145-
avail_coll_t *avail = (avail_coll_t *) item;
135+
mca_coll_base_avail_coll_t *avail = (mca_coll_base_avail_coll_t *) item;
146136

147137
/* initialize the module */
148138
ret = avail->ac_module->coll_module_enable(avail->ac_module, comm);
@@ -153,6 +143,9 @@ int mca_coll_base_comm_select(ompi_communicator_t * comm)
153143
(OMPI_SUCCESS == ret ? "Enabled": "Disabled") );
154144

155145
if (OMPI_SUCCESS == ret) {
146+
/* Save every component that is initialized,
147+
* queried and enabled successfully */
148+
opal_list_append(comm->c_coll->module_list, &avail->super);
156149

157150
/* copy over any of the pointers */
158151
COPY(avail->ac_module, comm, allgather);
@@ -230,10 +223,11 @@ int mca_coll_base_comm_select(ompi_communicator_t * comm)
230223
COPY(avail->ac_module, comm, neighbor_alltoallw_init);
231224

232225
COPY(avail->ac_module, comm, reduce_local);
226+
} else {
227+
/* release the original module reference and the list item */
228+
OBJ_RELEASE(avail->ac_module);
229+
OBJ_RELEASE(avail);
233230
}
234-
/* release the original module reference and the list item */
235-
OBJ_RELEASE(avail->ac_module);
236-
OBJ_RELEASE(avail);
237231
}
238232

239233
/* Done with the list from the check_components() call so release it. */
@@ -306,8 +300,8 @@ int mca_coll_base_comm_select(ompi_communicator_t * comm)
306300

307301
static int avail_coll_compare (opal_list_item_t **a,
308302
opal_list_item_t **b) {
309-
avail_coll_t *acoll = (avail_coll_t *) *a;
310-
avail_coll_t *bcoll = (avail_coll_t *) *b;
303+
mca_coll_base_avail_coll_t *acoll = (mca_coll_base_avail_coll_t *) *a;
304+
mca_coll_base_avail_coll_t *bcoll = (mca_coll_base_avail_coll_t *) *b;
311305

312306
if (acoll->ac_priority > bcoll->ac_priority) {
313307
return 1;
@@ -332,7 +326,7 @@ static opal_list_t *check_components(opal_list_t * components,
332326
mca_base_component_list_item_t *cli;
333327
mca_coll_base_module_2_3_0_t *module;
334328
opal_list_t *selectable;
335-
avail_coll_t *avail;
329+
mca_coll_base_avail_coll_t *avail;
336330

337331
/* Make a list of the components that query successfully */
338332
selectable = OBJ_NEW(opal_list_t);
@@ -345,7 +339,7 @@ static opal_list_t *check_components(opal_list_t * components,
345339
if (priority >= 0) {
346340
/* We have a component that indicated that it wants to run
347341
by giving us a module */
348-
avail = OBJ_NEW(avail_coll_t);
342+
avail = OBJ_NEW(mca_coll_base_avail_coll_t);
349343
avail->ac_priority = priority;
350344
avail->ac_module = module;
351345
// Point to the string so we don't have to free later

ompi/mca/coll/base/coll_base_comm_unselect.c

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
* and Technology (RIST). All rights reserved.
1717
* Copyright (c) 2017 IBM Corporation. All rights reserved.
1818
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
19+
* Copyright (c) 2020 BULL S.A.S. All rights reserved.
1920
* $COPYRIGHT$
2021
*
2122
* Additional copyrights may follow
@@ -34,6 +35,7 @@
3435

3536
#include "ompi/communicator/communicator.h"
3637
#include "ompi/mca/coll/base/base.h"
38+
#include "ompi/mca/coll/base/coll_base_util.h"
3739

3840
#define CLOSE(comm, func) \
3941
do { \
@@ -50,6 +52,8 @@
5052

5153
int mca_coll_base_comm_unselect(ompi_communicator_t * comm)
5254
{
55+
opal_list_item_t *item;
56+
5357
CLOSE(comm, allgather);
5458
CLOSE(comm, allgatherv);
5559
CLOSE(comm, allreduce);
@@ -124,6 +128,17 @@ int mca_coll_base_comm_unselect(ompi_communicator_t * comm)
124128

125129
CLOSE(comm, reduce_local);
126130

131+
for (item = opal_list_remove_first(comm->c_coll->module_list);
132+
NULL != item; item = opal_list_remove_first(comm->c_coll->module_list)) {
133+
mca_coll_base_avail_coll_t *avail = (mca_coll_base_avail_coll_t *) item;
134+
135+
if(avail->ac_module) {
136+
OBJ_RELEASE(avail->ac_module);
137+
}
138+
OBJ_RELEASE(avail);
139+
}
140+
OBJ_RELEASE(comm->c_coll->module_list);
141+
127142
free(comm->c_coll);
128143
comm->c_coll = NULL;
129144

ompi/mca/coll/base/coll_base_util.c

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -305,3 +305,39 @@ static void nbc_req_cons(ompi_coll_base_nbc_request_t *req) {
305305
}
306306

307307
OBJ_CLASS_INSTANCE(ompi_coll_base_nbc_request_t, ompi_request_t, nbc_req_cons, NULL);
308+
309+
/* File reading functions */
310+
/**
 * Consume characters from a stream up to and including the next newline.
 *
 * @param fptr      stream to read from
 * @param fileline  line counter; incremented by one when a newline is consumed
 *
 * If end-of-file or a read error is hit before a newline is found, the
 * function returns without touching *fileline.
 */
static void skiptonewline (FILE *fptr, int *fileline)
{
    int c;

    /* fgetc() reports both end-of-file and read errors as EOF, so a single
     * test covers every "nothing more to read" case. */
    while (EOF != (c = fgetc(fptr))) {
        if ('\n' == c) {
            (*fileline)++;
            return;
        }
    }
}
324+
325+
long ompi_coll_base_file_getnext (FILE *fptr, int *fileline)
326+
{
327+
do {
328+
long val;
329+
int rc;
330+
char trash;
331+
332+
rc = fscanf(fptr, "%li", &val);
333+
if (rc == EOF) return MYEOF;
334+
if (1 == rc) return val;
335+
/* in all other cases, skip to the end */
336+
rc = fread(&trash, sizeof(char), 1, fptr);
337+
if (rc == EOF) return MYEOF;
338+
if ('\n' == trash) (*fileline)++;
339+
if ('#' == trash) {
340+
skiptonewline (fptr, fileline);
341+
}
342+
} while (1);
343+
}

ompi/mca/coll/base/coll_base_util.h

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,19 @@ ompi_coll_base_nbc_reserve_tags(ompi_communicator_t* comm, int32_t reserve)
8484

8585
typedef struct ompi_coll_base_nbc_request_t ompi_coll_base_nbc_request_t;
8686

87+
/*
88+
* Structure to store an available module
89+
*/
90+
struct mca_coll_base_avail_coll_t {
91+
opal_list_item_t super;
92+
93+
int ac_priority;
94+
mca_coll_base_module_t *ac_module;
95+
const char * ac_component_name;
96+
};
97+
typedef struct mca_coll_base_avail_coll_t mca_coll_base_avail_coll_t;
98+
OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_coll_base_avail_coll_t);
99+
87100
/**
88101
* A MPI_like function doing a send and a receive simultaneously.
89102
* If one of the communications results in a zero-byte message the
@@ -164,5 +177,9 @@ int ompi_coll_base_retain_datatypes_w( ompi_request_t *request,
164177
ompi_datatype_t * const stypes[],
165178
ompi_datatype_t * const rtypes[]);
166179

180+
/* File reading function */

/* Sentinel returned by ompi_coll_base_file_getnext() when no further value
 * can be read. Parenthesized so the negative literal expands safely inside
 * any surrounding expression. */
#define MYEOF (-999)

/**
 * Return the next integer found in fptr, or MYEOF when the end of the
 * stream is reached.
 *
 * @param fptr      stream to read from
 * @param fileline  line counter updated as input is discarded
 */
long ompi_coll_base_file_getnext(FILE *fptr, int *fileline);
183+
167184
END_C_DECLS
168185
#endif /* MCA_COLL_BASE_UTIL_EXPORT_H */

ompi/mca/coll/coll.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
* and Technology (RIST). All rights reserved.
2020
* Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
2121
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
22+
* Copyright (c) 2020 BULL S.A.S. All rights reserved.
2223
* $COPYRIGHT$
2324
*
2425
* Additional copyrights may follow
@@ -767,6 +768,9 @@ struct mca_coll_base_comm_coll_t {
767768

768769
mca_coll_base_module_reduce_local_fn_t coll_reduce_local;
769770
mca_coll_base_module_2_3_0_t *coll_reduce_local_module;
771+
772+
/* List of modules initialized, queried and enabled */
773+
opal_list_t *module_list;
770774
};
771775
typedef struct mca_coll_base_comm_coll_t mca_coll_base_comm_coll_t;
772776

ompi/mca/coll/han/Makefile.am

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
sources = \
1313
coll_han.h \
1414
coll_han_trigger.h \
15+
coll_han_dynamic.h \
16+
coll_han_dynamic_file.h \
1517
coll_han_bcast.c \
1618
coll_han_reduce.c \
1719
coll_han_scatter.c \
@@ -21,6 +23,10 @@ coll_han_allgather.c \
2123
coll_han_component.c \
2224
coll_han_module.c \
2325
coll_han_trigger.c \
26+
coll_han_dynamic.c \
27+
coll_han_dynamic_file.c \
28+
coll_han_topo.c \
29+
coll_han_subcomms.c \
2430
coll_han_utils.c
2531

2632
# Make the output library in this directory, and name it either

0 commit comments

Comments
 (0)