Skip to content

Commit 0396a1f

Browse files
committed
Re-factor opal_show_help() to use PMIx_Log().
This change passes in the file and topic as key/value pairs to PMIx_Log(), which will then aggregate and de-duplicate these messages. The de-duplication code was moved from PRRTE to PMIx. This change is 'on' by default for mpirun/prterun jobs. It can be controlled via the MCA parameter `opal_base_help_aggregate`. Signed-off-by: Austen Lauria <awlauria@us.ibm.com>
1 parent f3384bf commit 0396a1f

File tree

1 file changed

+77
-7
lines changed

1 file changed

+77
-7
lines changed

opal/util/show_help.c

Lines changed: 77 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ static const char *dash_line
4848
= "--------------------------------------------------------------------------\n";
4949
static int output_stream = -1;
5050
static char **search_dirs = NULL;
51+
static bool opal_help_want_aggregate = true;
5152

5253
/*
5354
* Local functions
@@ -58,13 +59,28 @@ static int opal_show_help_internal(const char *filename, const char *topic, int
5859
...);
5960
static void opal_show_help_finalize(void);
6061

62+
/*
 * Bookkeeping for one PMIx_Log_nb() request.  An instance is handed to
 * PMIx_Log_nb() as the callback data so that the completion callback can
 * reach (and clean up) everything that belongs to the request.
 */
typedef struct {
63+
/* Data array passed to PMIx_Log_nb(); it is loaded with the message text. */
pmix_info_t *info;
64+
/* Directive array passed to PMIx_Log_nb(); only set when aggregation is enabled. */
pmix_info_t *dirs;
65+
/* The formatted help message; the callback prints it if logging failed, then frees it. */
char *msg;
66+
} opal_log_info_t;
67+
6168
opal_show_help_fn_t opal_show_help = opal_show_help_internal;
6269
opal_show_vhelp_fn_t opal_show_vhelp = opal_show_vhelp_internal;
6370

6471
int opal_show_help_init(void)
6572
{
6673
opal_output_stream_t lds;
6774

75+
opal_help_want_aggregate = true;
76+
mca_base_var_register("opal", NULL, "base", "help_aggregate",
77+
"If opal_base_help_aggregate is true, duplicate help messages will be aggregated rather "
78+
"than displayed individually. This can be helpful for parallel jobs that experience "
79+
"multiple identical failures; rather than print out the same help/failure message N times, "
80+
"display it once with a count of how many processes sent the same message. Default: true.",
81+
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_9,
82+
MCA_BASE_VAR_SCOPE_LOCAL, &opal_help_want_aggregate);
83+
6884
OBJ_CONSTRUCT(&lds, opal_output_stream_t);
6985
lds.lds_want_stderr = true;
7086
output_stream = opal_output_open(&lds);
@@ -88,6 +104,58 @@ static void opal_show_help_finalize(void)
88104
}
89105
}
90106

107+
static void opal_show_help_cbfunc(pmix_status_t status, void *cbdata)
108+
{
109+
opal_log_info_t *info = (opal_log_info_t *) cbdata;
110+
if(PMIX_SUCCESS != status && PMIX_OPERATION_SUCCEEDED != status) {
111+
// Aggregation/de-duplication functionality is *probably* lost,
112+
// but let's print the error anyway since duplicate error messages
113+
// is better than hiding it.
114+
opal_output(output_stream, "%s", info->msg);
115+
}
116+
PMIX_INFO_DESTRUCT(info->info);
117+
if(info->dirs) {
118+
PMIX_INFO_DESTRUCT(info->dirs);
119+
}
120+
free(info->msg);
121+
free(info);
122+
}
123+
124+
static void local_delivery(const char *file, const char *topic, char *msg) {
125+
pmix_info_t *info, *dirs;
126+
int ninfo = 0, ndirs = 0;
127+
PMIX_INFO_CREATE(info, 1);
128+
PMIX_INFO_LOAD(&info[ninfo++], PMIX_LOG_STDERR, msg, PMIX_STRING);
129+
130+
opal_log_info_t *cbdata = calloc(1, sizeof(opal_log_info_t));
131+
if(opal_help_want_aggregate) {
132+
PMIX_INFO_CREATE(dirs, 3);
133+
PMIX_INFO_LOAD(&dirs[ndirs++], PMIX_LOG_AGG, &opal_help_want_aggregate, PMIX_BOOL);
134+
PMIX_INFO_LOAD(&dirs[ndirs++], PMIX_LOG_KEY, file, PMIX_STRING);
135+
PMIX_INFO_LOAD(&dirs[ndirs++], PMIX_LOG_VAL, topic, PMIX_STRING);
136+
cbdata->dirs = dirs;
137+
}
138+
139+
cbdata->info = info;
140+
cbdata->msg = msg;
141+
142+
// PMIx and the runtime will aggregate, de-duplicate, and print this
143+
// message to stderr.
144+
pmix_status_t rc = PMIx_Log_nb(info, ninfo, dirs, ndirs, opal_show_help_cbfunc, cbdata);
145+
if(PMIX_SUCCESS != rc) {
146+
// Aggregation/de-duplication functionality is *definitely* lost,
147+
// but let's print the error anyway since duplicate error messages
148+
// is better than hiding it.
149+
opal_output(output_stream, "%s", msg);
150+
PMIX_INFO_DESTRUCT(info);
151+
if(opal_help_want_aggregate) {
152+
PMIX_INFO_DESTRUCT(dirs);
153+
}
154+
free(msg);
155+
free(cbdata);
156+
}
157+
}
158+
91159
/*
92160
* Make one big string with all the lines. This isn't the most
93161
* efficient method in the world, but we're going for clarity here --
@@ -180,10 +248,12 @@ static int open_file(const char *base, const char *topic)
180248

181249
/* If we still couldn't open it, then something is wrong */
182250
if (NULL == opal_show_help_yyin) {
183-
opal_output(output_stream,
251+
char *tmp;
252+
opal_asprintf(&tmp,
184253
"%sSorry! You were supposed to get help about:\n %s\nBut I couldn't open "
185254
"the help file:\n %s. Sorry!\n%s",
186255
dash_line, topic, err_msg, dash_line);
256+
local_delivery(topic, err_msg, tmp);
187257
free(err_msg);
188258
return OPAL_ERR_NOT_FOUND;
189259
}
@@ -231,14 +301,15 @@ static int find_topic(const char *base, const char *topic)
231301
case OPAL_SHOW_HELP_PARSE_MESSAGE:
232302
break;
233303

234-
case OPAL_SHOW_HELP_PARSE_DONE:
235-
opal_output(output_stream,
304+
case OPAL_SHOW_HELP_PARSE_DONE: {
305+
char *msg;
306+
opal_asprintf(&msg,
236307
"%sSorry! You were supposed to get help about:\n %s\nfrom the file:\n "
237308
" %s\nBut I couldn't find that topic in the file. Sorry!\n%s",
238309
dash_line, topic, base, dash_line);
310+
local_delivery(topic, base, msg);
239311
return OPAL_ERR_NOT_FOUND;
240-
break;
241-
312+
}
242313
default:
243314
break;
244315
}
@@ -344,8 +415,7 @@ static int opal_show_vhelp_internal(const char *filename, const char *topic, int
344415

345416
/* If we got a single string, output it with formatting */
346417
if (NULL != output) {
347-
opal_output(output_stream, "%s", output);
348-
free(output);
418+
local_delivery(filename, topic, output);
349419
}
350420

351421
return (NULL == output) ? OPAL_ERROR : OPAL_SUCCESS;

0 commit comments

Comments
 (0)