Skip to content

Commit 628ccef

Browse files
committed
pml/ob1: make no. of events an mca parameter
Make the number of events created by the pml/ob1 component an MCA parameter. The error message shown when we run out of events already suggested increasing a particular MCA parameter, but a) I couldn't find that MCA parameter, and b) it was definitely not used. Signed-off-by: Edgar Gabriel <Edgar.Gabriel@amd.com>
1 parent c29f239 commit 628ccef

File tree

3 files changed

+22
-14
lines changed

3 files changed

+22
-14
lines changed

ompi/mca/pml/ob1/pml_ob1.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,8 @@ typedef struct mca_pml_ob1_t mca_pml_ob1_t;
9494
extern mca_pml_ob1_t mca_pml_ob1;
9595
extern int mca_pml_ob1_output;
9696
extern bool mca_pml_ob1_matching_protection;
97+
extern int mca_pml_ob1_accelerator_events_max;
98+
9799
/*
98100
* PML interface functions.
99101
*/

ompi/mca/pml/ob1/pml_ob1_accelerator.c

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,6 @@ static int accelerator_event_dtoh_first_used, accelerator_event_htod_first_used;
7272
static volatile int accelerator_event_dtoh_num_used, accelerator_event_htod_num_used;
7373

7474
/* Size of array holding events */
75-
static int accelerator_event_max = 400;
7675
static int accelerator_event_htod_most = 0;
7776

7877
int mca_pml_ob1_record_htod_event(char *msg, struct mca_btl_base_descriptor_t *frag)
@@ -87,9 +86,9 @@ int mca_pml_ob1_record_htod_event(char *msg, struct mca_btl_base_descriptor_t *f
8786
* return an error. The error message will tell the user to try and
8887
* run again, but with a larger array for storing events. */
8988
OPAL_THREAD_LOCK(&pml_ob1_accelerator_htod_lock);
90-
if (accelerator_event_htod_num_used == accelerator_event_max) {
91-
opal_output_verbose(1, mca_pml_ob1_output, "Out of event handles. Max: %d. Suggested to rerun with new max with --mca mpi_common_accelerator_event_max %d.",
92-
accelerator_event_max, accelerator_event_max + 100);
89+
if (accelerator_event_htod_num_used == mca_pml_ob1_accelerator_events_max) {
90+
opal_output_verbose(1, mca_pml_ob1_output, "Out of event handles. Max: %d. Suggested to rerun with new max with --mca pml_ob1_accelerator_events_max %d.",
91+
mca_pml_ob1_accelerator_events_max, mca_pml_ob1_accelerator_events_max + 100);
9392
OPAL_THREAD_UNLOCK(&pml_ob1_accelerator_htod_lock);
9493
return OPAL_ERR_OUT_OF_RESOURCE;
9594
}
@@ -113,7 +112,7 @@ int mca_pml_ob1_record_htod_event(char *msg, struct mca_btl_base_descriptor_t *f
113112

114113
/* Bump up the first available slot and number used by 1 */
115114
accelerator_event_htod_first_avail++;
116-
if (accelerator_event_htod_first_avail >= accelerator_event_max) {
115+
if (accelerator_event_htod_first_avail >= mca_pml_ob1_accelerator_events_max) {
117116
accelerator_event_htod_first_avail = 0;
118117
}
119118
accelerator_event_htod_num_used++;
@@ -169,7 +168,7 @@ int mca_pml_ob1_progress_one_htod_event(struct mca_btl_base_descriptor_t **frag)
169168
/* Bump counters, loop around the circular buffer if necessary */
170169
--accelerator_event_htod_num_used;
171170
++accelerator_event_htod_first_used;
172-
if (accelerator_event_htod_first_used >= accelerator_event_max) {
171+
if (accelerator_event_htod_first_used >= mca_pml_ob1_accelerator_events_max) {
173172
accelerator_event_htod_first_used = 0;
174173
}
175174
/* A return value of 1 indicates an event completed and a frag was returned */
@@ -214,15 +213,15 @@ int mca_pml_ob1_accelerator_init(void)
214213
accelerator_event_dtoh_first_avail = 0;
215214
accelerator_event_dtoh_first_used = 0;
216215

217-
accelerator_event_dtoh_array = calloc(accelerator_event_max, sizeof(opal_accelerator_event_t *));
216+
accelerator_event_dtoh_array = calloc(mca_pml_ob1_accelerator_events_max, sizeof(opal_accelerator_event_t *));
218217
if (NULL == accelerator_event_dtoh_array) {
219218
opal_output_verbose(1, mca_pml_ob1_output, "No memory.");
220219
rc = OPAL_ERROR;
221220
goto cleanup_and_error;
222221
}
223222

224223
/* Create the events since they can be reused. */
225-
for (i = 0; i < accelerator_event_max; i++) {
224+
for (i = 0; i < mca_pml_ob1_accelerator_events_max; i++) {
226225
result = opal_accelerator.create_event(MCA_ACCELERATOR_NO_DEVICE_ID, &accelerator_event_dtoh_array[i], false);
227226
if (OPAL_SUCCESS != result) {
228227
opal_output_verbose(1, mca_pml_ob1_output, "Accelerator create event failed.");
@@ -234,7 +233,7 @@ int mca_pml_ob1_accelerator_init(void)
234233
/* The first available status index is 0. Make an empty frag
235234
array. */
236235
accelerator_event_dtoh_frag_array = (struct mca_btl_base_descriptor_t **) malloc(
237-
sizeof(struct mca_btl_base_descriptor_t *) * accelerator_event_max);
236+
sizeof(struct mca_btl_base_descriptor_t *) * mca_pml_ob1_accelerator_events_max);
238237
if (NULL == accelerator_event_dtoh_frag_array) {
239238
opal_output_verbose(1, mca_pml_ob1_output, "No memory.");
240239
rc = OPAL_ERROR;
@@ -247,15 +246,15 @@ int mca_pml_ob1_accelerator_init(void)
247246
accelerator_event_htod_first_avail = 0;
248247
accelerator_event_htod_first_used = 0;
249248

250-
accelerator_event_htod_array = calloc(accelerator_event_max, sizeof(opal_accelerator_event_t *));
249+
accelerator_event_htod_array = calloc(mca_pml_ob1_accelerator_events_max, sizeof(opal_accelerator_event_t *));
251250
if (NULL == accelerator_event_htod_array) {
252251
opal_output_verbose(1, mca_pml_ob1_output, "No memory.");
253252
rc = OPAL_ERROR;
254253
goto cleanup_and_error;
255254
}
256255

257256
/* Create the events since they can be reused. */
258-
for (i = 0; i < accelerator_event_max; i++) {
257+
for (i = 0; i < mca_pml_ob1_accelerator_events_max; i++) {
259258
result = opal_accelerator.create_event(MCA_ACCELERATOR_NO_DEVICE_ID, &accelerator_event_htod_array[i], false);
260259
if (OPAL_SUCCESS != result) {
261260
opal_output_verbose(1, mca_pml_ob1_output, "Accelerator create event failed.");
@@ -267,7 +266,7 @@ int mca_pml_ob1_accelerator_init(void)
267266
/* The first available status index is 0. Make an empty frag
268267
array. */
269268
accelerator_event_htod_frag_array = (struct mca_btl_base_descriptor_t **) malloc(
270-
sizeof(struct mca_btl_base_descriptor_t *) * accelerator_event_max);
269+
sizeof(struct mca_btl_base_descriptor_t *) * mca_pml_ob1_accelerator_events_max);
271270
if (NULL == accelerator_event_htod_frag_array) {
272271
opal_output_verbose(1, mca_pml_ob1_output, "No memory.");
273272
rc = OPAL_ERROR;
@@ -304,7 +303,7 @@ void mca_pml_ob1_accelerator_fini(void)
304303
}
305304

306305
if (NULL != accelerator_event_htod_array) {
307-
for (i = 0; i < accelerator_event_max; i++) {
306+
for (i = 0; i < mca_pml_ob1_accelerator_events_max; i++) {
308307
if (NULL != accelerator_event_htod_array[i]) {
309308
OBJ_RELEASE(accelerator_event_htod_array[i]);
310309
}
@@ -313,7 +312,7 @@ void mca_pml_ob1_accelerator_fini(void)
313312
}
314313

315314
if (NULL != accelerator_event_dtoh_array) {
316-
for (i = 0; i < accelerator_event_max; i++) {
315+
for (i = 0; i < mca_pml_ob1_accelerator_events_max; i++) {
317316
if (NULL != accelerator_event_dtoh_array[i]) {
318317
OBJ_RELEASE(accelerator_event_dtoh_array[i]);
319318
}

ompi/mca/pml/ob1/pml_ob1_component.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ static int mca_pml_ob1_component_fini(void);
6262
int mca_pml_ob1_output = 0;
6363
static int mca_pml_ob1_verbose = 0;
6464
bool mca_pml_ob1_matching_protection = false;
65+
int mca_pml_ob1_accelerator_events_max = 400;
6566

6667
mca_pml_base_component_2_1_0_t mca_pml_ob1_component = {
6768
/* First, the mca_base_component_t struct containing meta
@@ -242,6 +243,12 @@ static int mca_pml_ob1_component_register(void)
242243
MCA_BASE_PVAR_FLAG_READONLY | MCA_BASE_PVAR_FLAG_CONTINUOUS,
243244
mca_pml_ob1_get_posted_recvq_size, NULL, mca_pml_ob1_comm_size_notify, NULL);
244245

246+
mca_pml_ob1_accelerator_events_max = 400;
247+
(void) mca_base_component_var_register(&mca_pml_ob1_component.pmlm_version, "accelerator_events_max",
248+
"Number of events created by the ob1 component internally",
249+
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_5,
250+
MCA_BASE_VAR_SCOPE_READONLY, &mca_pml_ob1_accelerator_events_max);
251+
245252
return OMPI_SUCCESS;
246253
}
247254

0 commit comments

Comments
 (0)