
Commit 8591403

Merge pull request #12391 from bosilca/fix/btl_smcuda_topo
Fix the NUMA detection for the smcuda BTL.
2 parents ee16107 + 6f35a48
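What the change does, in short: smcuda_btl_first_time_init() previously kept function-local shadow copies (my_mem_node, num_mem_nodes) of the component's NUMA fields and reused the loop counter i as a return code. The patch writes detection results directly into mca_btl_smcuda_component and falls back to a single NUMA node when no topology information can be found. A minimal sketch of the resulting flow, with the PMIx/hwloc queries reduced to plain parameters (detect_numa() and its arguments are illustrative, not Open MPI API):

#include <stdio.h>

static int mem_node;      /* NUMA node we are bound to; -1 means unknown */
static int num_mem_nodes; /* number of NUMA nodes on the host */

static void detect_numa(int detected_nodes, int detected_node)
{
    /* start pessimistic, as the patched code does */
    mem_node = -1;
    num_mem_nodes = 1;

    if (detected_nodes > 0)
        num_mem_nodes = detected_nodes; /* e.g. from PMIX_TOPOLOGY_SIGNATURE */
    if (detected_node >= 0)
        mem_node = detected_node;       /* e.g. from PMIX_LOCALITY_STRING */

    /* the new sanity check: if anything is missing, pretend the host is a
     * single NUMA node so the per-NUMA arrays stay safely indexable */
    if (mem_node < 0 || num_mem_nodes < 1) {
        fprintf(stderr, "no topology info, assuming one NUMA node\n");
        mem_node = 0;
        num_mem_nodes = 1;
    }
}

int main(void)
{
    detect_numa(2, -1); /* two nodes reported, binding unknown */
    printf("mem_node=%d num_mem_nodes=%d\n", mem_node, num_mem_nodes);
    return 0;           /* prints mem_node=0 num_mem_nodes=1 */
}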

1 file changed

opal/mca/btl/smcuda/btl_smcuda.c

Lines changed: 61 additions & 57 deletions
@@ -14,7 +14,7 @@
  * Copyright (c) 2009-2012 Cisco Systems, Inc. All rights reserved.
  * Copyright (c) 2010-2017 Los Alamos National Security, LLC. All rights
  *                         reserved.
- * Copyright (c) 2012-2023 NVIDIA Corporation. All rights reserved.
+ * Copyright (c) 2012-2024 NVIDIA Corporation. All rights reserved.
  * Copyright (c) 2012      Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2014-2017 Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
@@ -216,56 +216,52 @@ static int smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl, int32_t my_s
 {
     size_t length, length_payload;
     sm_fifo_t *my_fifos;
-    int my_mem_node, num_mem_nodes, i, rc;
     mca_common_sm_mpool_resources_t *res = NULL;
-    mca_btl_smcuda_component_t *m = &mca_btl_smcuda_component;
     char *loc, *mynuma;
     opal_process_name_t wildcard_rank;
+    int rc;

     /* Assume we don't have hwloc support and fill in dummy info */
-    mca_btl_smcuda_component.mem_node = my_mem_node = 0;
-    mca_btl_smcuda_component.num_mem_nodes = num_mem_nodes = 1;
+    mca_btl_smcuda_component.mem_node = -1;
+    mca_btl_smcuda_component.num_mem_nodes = 1;

     /* see if we were given a topology signature */
     wildcard_rank.jobid = OPAL_PROC_MY_NAME.jobid;
     wildcard_rank.vpid = OPAL_VPID_WILDCARD;
     OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_TOPOLOGY_SIGNATURE, &wildcard_rank, &loc, PMIX_STRING);
     if (OPAL_SUCCESS == rc) {
         /* the number of NUMA nodes is right at the front */
-        mca_btl_smcuda_component.num_mem_nodes = num_mem_nodes = strtoul(loc, NULL, 10);
+        mca_btl_smcuda_component.num_mem_nodes = strtoul(loc, NULL, 10);
         free(loc);
     } else {
         /* If we have hwloc support, then get accurate information */
         loc = NULL;
         if (OPAL_SUCCESS == opal_hwloc_base_get_topology()) {
-            i = opal_hwloc_base_get_nbobjs_by_type(opal_hwloc_topology, HWLOC_OBJ_NODE, 0,
-                                                   OPAL_HWLOC_AVAILABLE);
+            rc = opal_hwloc_base_get_nbobjs_by_type(opal_hwloc_topology, HWLOC_OBJ_NODE, 0,
+                                                    OPAL_HWLOC_AVAILABLE);

             /* JMS This tells me how many numa nodes are *available*,
                but it's not how many are being used *by this job*.
                Note that this is the value we've previously used (from
                the previous carto-based implementation), but it really
                should be improved to be how many NUMA nodes are being
                used *in this job*. */
-            mca_btl_smcuda_component.num_mem_nodes = num_mem_nodes = i;
+            mca_btl_smcuda_component.num_mem_nodes = rc;
         }
     }
     /* see if we were given our location */
     OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY_STRING, &OPAL_PROC_MY_NAME, &loc, PMIX_STRING);
     if (OPAL_SUCCESS == rc) {
-        if (NULL == loc) {
-            mca_btl_smcuda_component.mem_node = my_mem_node = -1;
-        } else {
+        if (NULL != loc) {
             /* get our NUMA location */
             mynuma = opal_hwloc_base_get_location(loc, HWLOC_OBJ_NODE, 0);
             if (NULL == mynuma || NULL != strchr(mynuma, ',') || NULL != strchr(mynuma, '-')) {
                 /* we either have no idea what NUMA we are on, or we
                  * are on multiple NUMA nodes */
-                mca_btl_smcuda_component.mem_node = my_mem_node = -1;
+                mca_btl_smcuda_component.mem_node = -1;
             } else {
                 /* we are bound to a single NUMA node */
-                my_mem_node = strtoul(mynuma, NULL, 10);
-                mca_btl_smcuda_component.mem_node = my_mem_node;
+                mca_btl_smcuda_component.mem_node = strtoul(mynuma, NULL, 10);
             }
             if (NULL != mynuma) {
                 free(mynuma);
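Note on the hunk above: the locality string returned by opal_hwloc_base_get_location() names the NUMA node(s) we are bound to, and a ',' or '-' in it means the binding spans several nodes. A standalone sketch of that classification (parse_numa_location() is a hypothetical helper, not part of the codebase):

#include <stdlib.h>
#include <string.h>

/* "3" -> 3 (single node); NULL, "0,1" or "0-3" -> -1 (unknown or multiple) */
static int parse_numa_location(const char *mynuma)
{
    if (NULL == mynuma || NULL != strchr(mynuma, ',') || NULL != strchr(mynuma, '-'))
        return -1;
    return (int) strtoul(mynuma, NULL, 10);
}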
@@ -274,14 +270,14 @@ static int smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl, int32_t my_s
         }
     } else {
         /* If we have hwloc support, then get accurate information */
-        if (OPAL_SUCCESS == opal_hwloc_base_get_topology() && num_mem_nodes > 0
+        if (OPAL_SUCCESS == opal_hwloc_base_get_topology() && mca_btl_smcuda_component.num_mem_nodes > 0
             && NULL != opal_process_info.cpuset) {
             int numa = 0, w;
             unsigned n_bound = 0;
             hwloc_obj_t obj;

             /* count the number of NUMA nodes to which we are bound */
-            for (w = 0; w < i; w++) {
+            for (w = 0; w < mca_btl_smcuda_component.num_mem_nodes; w++) {
                 if (NULL
                     == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_NODE,
                                                               0, w, OPAL_HWLOC_AVAILABLE))) {
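The loop above now bounds w by mca_btl_smcuda_component.num_mem_nodes instead of the stale counter i; conceptually it counts how many available NUMA objects intersect our cpuset. A toy version with the hwloc lookup replaced by a plain membership array (count_bound() and is_bound[] are illustrative only):

/* returns how many NUMA nodes we are bound to; *numa_out holds the last
 * bound node and is only meaningful when the return value is exactly 1 */
static int count_bound(const int *is_bound, int num_mem_nodes, int *numa_out)
{
    int w, n_bound = 0;
    for (w = 0; w < num_mem_nodes; w++) {
        if (is_bound[w]) {
            *numa_out = w;
            n_bound++;
        }
    }
    return n_bound;
}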
@@ -297,27 +293,35 @@ static int smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl, int32_t my_s
              * a NUMA we are on, then not much we can do
              */
             if (1 == n_bound) {
-                mca_btl_smcuda_component.mem_node = my_mem_node = numa;
-            } else {
-                mca_btl_smcuda_component.mem_node = my_mem_node = -1;
+                mca_btl_smcuda_component.mem_node = numa;
             }
         }
     }
+    /* sanity check: do we have the NUMA node info ? */
+    if( mca_btl_smcuda_component.mem_node < 0 ||
+        mca_btl_smcuda_component.num_mem_nodes < 1) {
+        opal_output_verbose(10, opal_btl_base_framework.framework_output,
+                            "btl:smcuda: %s unable to find topological information mem_node=%d, num_mem_nodes=%d",
+                            OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
+                            mca_btl_smcuda_component.mem_node, mca_btl_smcuda_component.num_mem_nodes);
+        mca_btl_smcuda_component.mem_node = 0;
+        mca_btl_smcuda_component.num_mem_nodes = 1;
+    }

     if (NULL == (res = calloc(1, sizeof(*res)))) {
         return OPAL_ERR_OUT_OF_RESOURCE;
     }

     /* lookup shared memory pool */
     mca_btl_smcuda_component.sm_mpools = (mca_mpool_base_module_t **)
-        calloc(num_mem_nodes, sizeof(mca_mpool_base_module_t *));
+        calloc(mca_btl_smcuda_component.num_mem_nodes, sizeof(mca_mpool_base_module_t *));

     /* Disable memory binding, because each MPI process will claim pages in the
      * mpool for their local NUMA node */
     res->mem_node = -1;
     res->allocator = mca_btl_smcuda_component.allocator;

-    if (OPAL_SUCCESS != (rc = setup_mpool_base_resources(m, res))) {
+    if (OPAL_SUCCESS != (rc = setup_mpool_base_resources(&mca_btl_smcuda_component, res))) {
         free(res);
         return rc;
     }
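One consequence of the sanity check in this hunk: num_mem_nodes is guaranteed to be at least 1 before the sm_mpools array is sized off the component field, so the calloc() above never sees a zero count. Reduced to its essence (types simplified to void *; the helper name is illustrative):

#include <stdlib.h>

/* one mpool slot per NUMA node; count >= 1 after the sanity check */
static void **alloc_sm_mpools(int num_mem_nodes)
{
    return calloc((size_t) num_mem_nodes, sizeof(void *));
}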
@@ -344,7 +348,7 @@ static int smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl, int32_t my_s

     /* remember that node rank zero is already attached */
     if (0 != my_smp_rank) {
-        if (OPAL_SUCCESS != (rc = sm_segment_attach(m))) {
+        if (OPAL_SUCCESS != (rc = sm_segment_attach(&mca_btl_smcuda_component))) {
             free(res);
             return rc;
         }
@@ -357,7 +361,7 @@ static int smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl, int32_t my_s
                         "btl:smcuda: host_register address=%p, size=%d",
                         mca_btl_smcuda_component.sm_mpool_base, (int) res->size);
     if (0 != strcmp(opal_accelerator_base_selected_component.base_version.mca_component_name, "null")) {
-        rc = opal_accelerator.host_register(MCA_ACCELERATOR_NO_DEVICE_ID, mca_btl_smcuda_component.sm_mpool_base, res->size);
+        rc = opal_accelerator.host_register(MCA_ACCELERATOR_NO_DEVICE_ID, mca_btl_smcuda_component.sm_mpool_base, res->size);
         if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
             /* If registering the memory fails, print a message and continue.
              * This is not a fatal error. */
@@ -394,7 +398,7 @@ static int smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl, int32_t my_s
     mca_btl_smcuda_component.shm_bases[mca_btl_smcuda_component.my_smp_rank]
         = (char *) mca_btl_smcuda_component.sm_mpool_base;
     mca_btl_smcuda_component.shm_mem_nodes[mca_btl_smcuda_component.my_smp_rank] = (uint16_t)
-        my_mem_node;
+        mca_btl_smcuda_component.mem_node;

     /* initialize the array of fifo's "owned" by this process */
     if (NULL == (my_fifos = (sm_fifo_t *) mpool_calloc(FIFO_MAP_NUM(n), sizeof(sm_fifo_t))))
@@ -420,45 +424,45 @@ static int smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl, int32_t my_s
     /* allocation will be for the fragment descriptor and payload buffer */
     length = sizeof(mca_btl_smcuda_frag1_t);
     length_payload = sizeof(mca_btl_smcuda_hdr_t) + mca_btl_smcuda_component.eager_limit;
-    i = opal_free_list_init(&mca_btl_smcuda_component.sm_frags_eager, length, opal_cache_line_size,
-                            OBJ_CLASS(mca_btl_smcuda_frag1_t), length_payload, opal_cache_line_size,
-                            mca_btl_smcuda_component.sm_free_list_num,
-                            mca_btl_smcuda_component.sm_free_list_max,
-                            mca_btl_smcuda_component.sm_free_list_inc,
-                            mca_btl_smcuda_component.sm_mpool, 0, NULL, NULL, NULL);
-    if (OPAL_SUCCESS != i)
-        return i;
+    rc = opal_free_list_init(&mca_btl_smcuda_component.sm_frags_eager, length, opal_cache_line_size,
+                             OBJ_CLASS(mca_btl_smcuda_frag1_t), length_payload, opal_cache_line_size,
+                             mca_btl_smcuda_component.sm_free_list_num,
+                             mca_btl_smcuda_component.sm_free_list_max,
+                             mca_btl_smcuda_component.sm_free_list_inc,
+                             mca_btl_smcuda_component.sm_mpool, 0, NULL, NULL, NULL);
+    if (OPAL_SUCCESS != rc)
+        return rc;

     length = sizeof(mca_btl_smcuda_frag2_t);
     length_payload = sizeof(mca_btl_smcuda_hdr_t) + mca_btl_smcuda_component.max_frag_size;
-    i = opal_free_list_init(&mca_btl_smcuda_component.sm_frags_max, length, opal_cache_line_size,
-                            OBJ_CLASS(mca_btl_smcuda_frag2_t), length_payload, opal_cache_line_size,
-                            mca_btl_smcuda_component.sm_free_list_num,
-                            mca_btl_smcuda_component.sm_free_list_max,
-                            mca_btl_smcuda_component.sm_free_list_inc,
-                            mca_btl_smcuda_component.sm_mpool, 0, NULL, NULL, NULL);
-    if (OPAL_SUCCESS != i)
-        return i;
-
-    i = opal_free_list_init(&mca_btl_smcuda_component.sm_frags_user, sizeof(mca_btl_smcuda_user_t),
-                            opal_cache_line_size, OBJ_CLASS(mca_btl_smcuda_user_t),
-                            sizeof(mca_btl_smcuda_hdr_t), opal_cache_line_size,
-                            mca_btl_smcuda_component.sm_free_list_num,
-                            mca_btl_smcuda_component.sm_free_list_max,
-                            mca_btl_smcuda_component.sm_free_list_inc,
-                            mca_btl_smcuda_component.sm_mpool, 0, NULL, NULL, NULL);
-    if (OPAL_SUCCESS != i)
-        return i;
+    rc = opal_free_list_init(&mca_btl_smcuda_component.sm_frags_max, length, opal_cache_line_size,
+                             OBJ_CLASS(mca_btl_smcuda_frag2_t), length_payload, opal_cache_line_size,
+                             mca_btl_smcuda_component.sm_free_list_num,
+                             mca_btl_smcuda_component.sm_free_list_max,
+                             mca_btl_smcuda_component.sm_free_list_inc,
+                             mca_btl_smcuda_component.sm_mpool, 0, NULL, NULL, NULL);
+    if (OPAL_SUCCESS != rc)
+        return rc;
+
+    rc = opal_free_list_init(&mca_btl_smcuda_component.sm_frags_user, sizeof(mca_btl_smcuda_user_t),
+                             opal_cache_line_size, OBJ_CLASS(mca_btl_smcuda_user_t),
+                             sizeof(mca_btl_smcuda_hdr_t), opal_cache_line_size,
+                             mca_btl_smcuda_component.sm_free_list_num,
+                             mca_btl_smcuda_component.sm_free_list_max,
+                             mca_btl_smcuda_component.sm_free_list_inc,
+                             mca_btl_smcuda_component.sm_mpool, 0, NULL, NULL, NULL);
+    if (OPAL_SUCCESS != rc)
+        return rc;

     mca_btl_smcuda_component.num_outstanding_frags = 0;

     mca_btl_smcuda_component.num_pending_sends = 0;
-    i = opal_free_list_init(&mca_btl_smcuda_component.pending_send_fl,
-                            sizeof(btl_smcuda_pending_send_item_t), 8,
-                            OBJ_CLASS(opal_free_list_item_t), 0, 0, 16, -1, 32, NULL, 0, NULL, NULL,
-                            NULL);
-    if (OPAL_SUCCESS != i)
-        return i;
+    rc = opal_free_list_init(&mca_btl_smcuda_component.pending_send_fl,
+                             sizeof(btl_smcuda_pending_send_item_t), 8,
+                             OBJ_CLASS(opal_free_list_item_t), 0, 0, 16, -1, 32, NULL, 0, NULL, NULL,
+                             NULL);
+    if (OPAL_SUCCESS != rc)
+        return rc;

     /* set flag indicating btl has been inited */
     smcuda_btl->btl_inited = true;
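The last hunk is a mechanical rename: the free-list setup had been using the loop counter i to hold return codes, which the earlier hunks made stale by removing it; the code now uses the dedicated rc. The pattern it converges on, sketched with a stand-in initializer (check_init() and SKETCH_SUCCESS are illustrative, not opal_free_list_init() or OPAL_SUCCESS):

#include <stdio.h>

#define SKETCH_SUCCESS 0

static int check_init(const char *name)
{
    printf("init %s\n", name); /* stands in for one opal_free_list_init() call */
    return SKETCH_SUCCESS;
}

static int first_time_init(void)
{
    int rc; /* one dedicated return-code variable, never a loop counter */

    if (SKETCH_SUCCESS != (rc = check_init("frags_eager")))
        return rc; /* propagate the first failure to the caller */
    if (SKETCH_SUCCESS != (rc = check_init("frags_max")))
        return rc;
    if (SKETCH_SUCCESS != (rc = check_init("frags_user")))
        return rc;
    return SKETCH_SUCCESS;
}

int main(void)
{
    return first_time_init();
}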

0 commit comments