Skip to content

Commit 2c77074

Browse files
authored
V1.4.x Cherry pick (#1128)
* CL/HIER: Remove per node leader, fix double free (#1126) * CL/HIER: per_node_leader, double free fix (cherry picked from commit 86e627c) * CL/HIER: Add flag for nonroot info (#1123) (cherry picked from commit e5c964d)
1 parent d733d17 commit 2c77074

File tree

7 files changed

+63
-73
lines changed

7 files changed

+63
-73
lines changed

src/components/base/ucc_base_iface.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,9 @@ typedef struct ucc_base_team_iface {
163163
} ucc_base_team_iface_t;
164164

165165
enum {
166-
UCC_BASE_CARGS_MAX_FRAG_COUNT = UCC_BIT(0)
166+
UCC_BASE_CARGS_MAX_FRAG_COUNT = UCC_BIT(0),
167+
/* Info is available on nonroot ranks */
168+
UCC_BASE_CARGS_NONROOT_INFO = UCC_BIT(1)
167169
};
168170

169171
typedef struct ucc_buffer_info_asymmetric_memtype {
@@ -179,6 +181,7 @@ typedef struct ucc_base_coll_args {
179181
ucc_coll_args_t args;
180182
ucc_team_t *team;
181183
size_t max_frag_count;
184+
/* For asymmetric mem types across ranks */
182185
ucc_buffer_info_asymmetric_memtype_t asymmetric_save_info;
183186
} ucc_base_coll_args_t;
184187

src/components/cl/hier/allgatherv/allgatherv.c

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ static inline ucc_status_t find_leader_rank(ucc_base_team_t *team,
5454
ucc_assert(team_rank >= 0 && team_rank < UCC_CL_TEAM_SIZE(cl_team));
5555
ucc_assert(SBGP_EXISTS(cl_team, NODE_LEADERS));
5656

57-
status = ucc_topo_get_node_leaders(core_team->topo, &node_leaders, NULL);
57+
status = ucc_topo_get_node_leaders(core_team->topo, &node_leaders);
5858
if (UCC_OK != status) {
5959
cl_error(team->context->lib, "Could not get node leaders");
6060
return status;
@@ -184,7 +184,7 @@ UCC_CL_HIER_PROFILE_FUNC(ucc_status_t, ucc_cl_hier_allgatherv_init,
184184
n_tasks = 0;
185185
UCC_CHECK_GOTO(is_block_ordered(cl_team, &block_ordered), free_sched, status);
186186
UCC_CHECK_GOTO(is_host_ordered(cl_team, &host_ordered), free_sched, status);
187-
is_contig = UCC_COLL_IS_DST_CONTIG(&args.args) && block_ordered && host_ordered;
187+
is_contig = block_ordered && host_ordered && ucc_coll_args_is_disp_contig(&args.args, team_size);
188188
/* handle the case where this rank may be the only one on this node */
189189
ldr_sbgp_only = !SBGP_ENABLED(cl_team, NODE) && SBGP_ENABLED(cl_team, NODE_LEADERS);
190190

@@ -304,6 +304,8 @@ UCC_CL_HIER_PROFILE_FUNC(ucc_status_t, ucc_cl_hier_allgatherv_init,
304304
args.args.dst.info_v.displacements = node_disps;
305305
args.args.dst.info_v.counts = node_counts;
306306
args.args.dst.info_v.buffer = node_gathered_data;
307+
/* Indicate to gatherv it can read the dst counts on nonroots */
308+
args.mask |= UCC_BASE_CARGS_NONROOT_INFO;
307309
}
308310
UCC_CHECK_GOTO(
309311
ucc_coll_init(SCORE_MAP(cl_team, NODE), &args, &tasks[n_tasks]),
@@ -313,11 +315,16 @@ UCC_CL_HIER_PROFILE_FUNC(ucc_status_t, ucc_cl_hier_allgatherv_init,
313315

314316
args = args_old;
315317

316-
/* Need to pack in case its not inplace or the buf isnt contig and we didnt do the gatherv */
318+
/* Need to pack in case it's not inplace or the buf isnt contig and we didnt do the gatherv */
317319
if (ldr_sbgp_only) {
320+
/* Linter assumes true for ldr_sbgp_only, but takes false branch for
321+
if (SBGP_ENABLED(cl_team, NODE_LEADERS)) node_gathered_data = ... */
322+
ucc_assert(node_gathered_data != NULL);
318323
if (!in_place) {
324+
//NOLINTNEXTLINE
319325
memcpy(node_gathered_data, args.args.src.info.buffer, args.args.src.info.count * ucc_dt_size(args.args.src.info.datatype));
320326
} else if (!is_contig) {
327+
//NOLINTNEXTLINE
321328
memcpy(node_gathered_data,
322329
PTR_OFFSET(args.args.dst.info_v.buffer,
323330
dt_size * ucc_coll_args_get_displacement(&args.args, args.args.dst.info_v.displacements, rank)),

src/components/cl/hier/allgatherv/unpack.c

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,6 @@ ucc_status_t ucc_cl_hier_allgatherv_unpack_start(ucc_coll_task_t *task)
7070
size_t dst_dt_size = ucc_dt_size(
7171
args->dst.info_v.datatype);
7272
ucc_rank_t *node_leaders = NULL;
73-
ucc_rank_t *per_node_leaders = NULL;
7473
ucc_sbgp_t *all_nodes = NULL;
7574
ucc_sbgp_t *node_leaders_sbgp = NULL;
7675
ucc_topo_t *topo = task->team->params.team->topo;
@@ -93,7 +92,7 @@ ucc_status_t ucc_cl_hier_allgatherv_unpack_start(ucc_coll_task_t *task)
9392

9493
// Get the node leaders
9594
UCC_CHECK_GOTO(
96-
ucc_topo_get_node_leaders(topo, &node_leaders, &per_node_leaders),
95+
ucc_topo_get_node_leaders(topo, &node_leaders),
9796
out, status);
9897

9998
// Get the node leaders subgroup for proper rank mapping
@@ -125,7 +124,6 @@ ucc_status_t ucc_cl_hier_allgatherv_unpack_start(ucc_coll_task_t *task)
125124
src_rank_disp = ucc_coll_args_get_displacement(
126125
args, args->src.info_v.displacements, node_leader_idx);
127126

128-
ucc_assert(per_node_leaders[node_leader_idx] == leader_team_rank);
129127
for (ucc_rank_t j = 0; j < all_nodes[node_leader_idx].group_size; j++) {
130128
if (all_nodes[node_leader_idx].group_size == 1) {
131129
// Node sbgp wont exist for just a single rank on a node

src/components/topo/ucc_topo.c

Lines changed: 19 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,6 @@ ucc_status_t ucc_topo_init(ucc_subset_t set, ucc_context_topo_t *ctx_topo,
186186
topo->all_numas = NULL;
187187
topo->all_nodes = NULL;
188188
topo->node_leaders = NULL;
189-
topo->per_node_leaders = NULL;
190189

191190
*_topo = topo;
192191
return UCC_OK;
@@ -229,9 +228,6 @@ void ucc_topo_cleanup(ucc_topo_t *topo)
229228
if (topo->node_leaders) {
230229
ucc_free(topo->node_leaders);
231230
}
232-
if (topo->per_node_leaders) {
233-
ucc_free(topo->per_node_leaders);
234-
}
235231
ucc_free(topo);
236232
}
237233
}
@@ -324,27 +320,24 @@ ucc_status_t ucc_sbgp_create_all_nodes(ucc_topo_t *topo, ucc_sbgp_t **_sbgps,
324320
ucc_rank_t ldr_team_rank = ucc_ep_map_eval(leader_sbgp->map, i);
325321
topo->set.myrank = ldr_team_rank;
326322
if (ucc_rank_on_local_node(myrank, topo)) {
327-
/* Skip creating sbgp, we have this node's sbgp already */
328-
sbgps[i] = *(ucc_topo_get_sbgp(topo, UCC_SBGP_NODE));
329323
leader_rank = topo->node_leader_rank;
330-
} else {
331-
status = ucc_sbgp_create_node(topo, &sbgps[i]);
332-
if (status == UCC_ERR_NOT_FOUND) {
333-
/* ucc_sbgp_create_node returned because 0 == node_size */
334-
sbgps[i].status = UCC_SBGP_NOT_EXISTS;
335-
continue;
336-
} else if (status != UCC_OK) {
337-
ucc_error("failed to create all_node subgroup %d", i);
338-
goto error;
339-
}
340-
if (sbgps[i].rank_map && sbgps[i].type != UCC_SBGP_FULL) {
341-
sbgps[i].map = ucc_ep_map_from_array(
342-
&sbgps[i].rank_map, sbgps[i].group_size,
343-
ucc_subset_size(&topo->set), 1);
344-
}
345-
if (sbgps[i].rank_map && sbgps[i].status == UCC_SBGP_NOT_EXISTS) {
346-
ucc_free(sbgps[i].rank_map);
347-
}
324+
}
325+
status = ucc_sbgp_create_node(topo, &sbgps[i]);
326+
if (status == UCC_ERR_NOT_FOUND) {
327+
/* ucc_sbgp_create_node returned because 0 == node_size */
328+
sbgps[i].status = UCC_SBGP_NOT_EXISTS;
329+
continue;
330+
} else if (status != UCC_OK) {
331+
ucc_error("failed to create all_node subgroup %d", i);
332+
goto error;
333+
}
334+
if (sbgps[i].rank_map && sbgps[i].type != UCC_SBGP_FULL) {
335+
sbgps[i].map = ucc_ep_map_from_array(
336+
&sbgps[i].rank_map, sbgps[i].group_size,
337+
ucc_subset_size(&topo->set), 1);
338+
}
339+
if (sbgps[i].rank_map && sbgps[i].status == UCC_SBGP_NOT_EXISTS) {
340+
ucc_free(sbgps[i].rank_map);
348341
}
349342
}
350343

@@ -384,8 +377,7 @@ ucc_status_t ucc_topo_get_all_nodes(ucc_topo_t *topo, ucc_sbgp_t **sbgps,
384377
return status;
385378
}
386379

387-
ucc_status_t ucc_topo_get_node_leaders(ucc_topo_t *topo, ucc_rank_t **node_leaders_out,
388-
ucc_rank_t **per_node_leaders_out)
380+
ucc_status_t ucc_topo_get_node_leaders(ucc_topo_t *topo, ucc_rank_t **node_leaders_out)
389381
{
390382
ucc_subset_t *set = &topo->set;
391383
ucc_rank_t size = ucc_subset_size(set);
@@ -397,17 +389,6 @@ ucc_status_t ucc_topo_get_node_leaders(ucc_topo_t *topo, ucc_rank_t **node_leade
397389

398390
if (topo->node_leaders) {
399391
*node_leaders_out = topo->node_leaders;
400-
if (per_node_leaders_out) {
401-
*per_node_leaders_out = topo->per_node_leaders;
402-
}
403-
return UCC_OK;
404-
}
405-
406-
// If we just want the per_node_leaders, return them
407-
if (!node_leaders_out && topo->per_node_leaders) {
408-
ucc_assert(topo->per_node_leaders != NULL);
409-
ucc_assert(per_node_leaders_out != NULL);
410-
*per_node_leaders_out = topo->per_node_leaders;
411392
return UCC_OK;
412393
}
413394

@@ -461,13 +442,9 @@ ucc_status_t ucc_topo_get_node_leaders(ucc_topo_t *topo, ucc_rank_t **node_leade
461442
}
462443

463444
topo->node_leaders = node_leaders;
464-
topo->per_node_leaders = per_node_leaders;
465445
//NOLINTNEXTLINE
466446
*node_leaders_out = node_leaders;
467-
if (per_node_leaders_out) {
468-
//NOLINTNEXTLINE
469-
*per_node_leaders_out = per_node_leaders;
470-
}
471447
ucc_free(ranks_seen_per_node);
448+
ucc_free(per_node_leaders);
472449
return UCC_OK;
473450
}

src/components/topo/ucc_topo.h

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,6 @@ typedef struct ucc_topo {
6060
(ucc_team) ranking */
6161
ucc_rank_t *node_leaders; /*< array mapping each rank to its node leader in the original
6262
(ucc_team) ranking, initialized on demand */
63-
ucc_rank_t *per_node_leaders; /*< array of node leaders per node, initialized on demand */
6463
ucc_subset_t set; /*< subset of procs from the ucc_context_topo.
6564
for ucc_team topo it is team->ctx_map */
6665
ucc_rank_t min_ppn; /*< min ppn across the nodes for a team */
@@ -262,7 +261,6 @@ static inline ucc_rank_t ucc_topo_nnodes(ucc_topo_t *topo)
262261
/* Returns node leaders array - array that maps each rank to the TEAM RANK that
263262
is the leader of that rank's node. Also returns per-node leaders array - array
264263
mapping node_id to the TEAM RANK of that node's leader */
265-
ucc_status_t ucc_topo_get_node_leaders(ucc_topo_t *topo, ucc_rank_t **node_leaders_out,
266-
ucc_rank_t **per_node_leaders_out);
264+
ucc_status_t ucc_topo_get_node_leaders(ucc_topo_t *topo, ucc_rank_t **node_leaders_out);
267265

268266
#endif

src/utils/ucc_coll_utils.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,30 @@ ucc_coll_args_get_total_count(const ucc_coll_args_t *args,
167167
return count;
168168
}
169169

170+
/* Check if the displacements are contig, whether or not the user passed
171+
UCC_COLL_ARGS_FLAG_CONTIG_DST_BUFFER in coll args */
172+
static inline int ucc_coll_args_is_disp_contig(const ucc_coll_args_t *args,
173+
ucc_rank_t size)
174+
{
175+
size_t count_accumulator = 0;
176+
size_t disps;
177+
ucc_rank_t i;
178+
179+
if (!UCC_COLL_IS_DST_CONTIG(args)) {
180+
for (i = 0; i < size; i++) {
181+
disps = ucc_coll_args_get_displacement(
182+
args, args->dst.info_v.displacements, i);
183+
if (disps != count_accumulator) {
184+
return 0;
185+
}
186+
count_accumulator += ucc_coll_args_get_count(
187+
args, args->dst.info_v.counts, i);
188+
}
189+
}
190+
191+
return 1;
192+
}
193+
170194
static inline size_t
171195
ucc_coll_args_get_v_buffer_size(const ucc_coll_args_t *args,
172196
const ucc_count_t *counts,

test/gtest/core/test_topo.cc

Lines changed: 4 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -619,7 +619,6 @@ UCC_TEST_F(test_topo, node_leaders)
619619
ucc_subset_t set;
620620
ucc_rank_t i;
621621
ucc_rank_t *node_leaders;
622-
ucc_rank_t *per_node_leaders;
623622
/* simulates world proc array: 2 nodes, 4 ranks per node */
624623
SET_PI(s, 0, 0xaaa, 0, 0); // Node 0, rank 0
625624
SET_PI(s, 1, 0xaaa, 1, 1); // Node 0, rank 1
@@ -640,7 +639,7 @@ UCC_TEST_F(test_topo, node_leaders)
640639
EXPECT_EQ(UCC_OK, ucc_topo_init(set, ctx_topo, &topo));
641640
topo->node_leader_rank_id = 0;
642641
EXPECT_EQ(2, topo->topo->nnodes);
643-
EXPECT_EQ(UCC_OK, ucc_topo_get_node_leaders(topo, &node_leaders, &per_node_leaders));
642+
EXPECT_EQ(UCC_OK, ucc_topo_get_node_leaders(topo, &node_leaders));
644643

645644
/* Verify node leaders array */
646645
// Node 0 ranks should point to rank 0 (first rank on node 0)
@@ -652,15 +651,11 @@ UCC_TEST_F(test_topo, node_leaders)
652651
EXPECT_EQ(4, node_leaders[i]);
653652
}
654653

655-
/* Verify per_node_leaders array */
656-
EXPECT_EQ(0, per_node_leaders[0]); // Node 0 leader
657-
EXPECT_EQ(4, per_node_leaders[1]); // Node 1 leader
658-
659654
/* Test with node_leader_rank_id = 1 */
660655
ucc_topo_cleanup(topo);
661656
EXPECT_EQ(UCC_OK, ucc_topo_init(set, ctx_topo, &topo));
662657
topo->node_leader_rank_id = 1;
663-
EXPECT_EQ(UCC_OK, ucc_topo_get_node_leaders(topo, &node_leaders, &per_node_leaders));
658+
EXPECT_EQ(UCC_OK, ucc_topo_get_node_leaders(topo, &node_leaders));
664659

665660
/* Verify node leaders array */
666661
// Node 0 ranks should point to rank 1 (second rank on node 0)
@@ -672,10 +667,6 @@ UCC_TEST_F(test_topo, node_leaders)
672667
EXPECT_EQ(5, node_leaders[i]);
673668
}
674669

675-
/* Verify per_node_leaders array */
676-
EXPECT_EQ(1, per_node_leaders[0]); // Node 0 leader
677-
EXPECT_EQ(5, per_node_leaders[1]); // Node 1 leader
678-
679670
/* Test with a subset of ranks */
680671
ucc_topo_cleanup(topo);
681672
ucc_rank_t ranks[] = {1, 2, 5, 6}; // Mix of ranks from both nodes
@@ -688,7 +679,7 @@ UCC_TEST_F(test_topo, node_leaders)
688679
/* Test subset with node_leader_rank_id = 0 */
689680
EXPECT_EQ(UCC_OK, ucc_topo_init(set, ctx_topo, &topo));
690681
topo->node_leader_rank_id = 0;
691-
EXPECT_EQ(UCC_OK, ucc_topo_get_node_leaders(topo, &node_leaders, &per_node_leaders));
682+
EXPECT_EQ(UCC_OK, ucc_topo_get_node_leaders(topo, &node_leaders));
692683

693684
/* Verify node leaders array for subset */
694685
// Ranks 0,1 (from node 0) should point to rank 0 (first rank on node 0)
@@ -698,15 +689,11 @@ UCC_TEST_F(test_topo, node_leaders)
698689
EXPECT_EQ(2, node_leaders[2]);
699690
EXPECT_EQ(2, node_leaders[3]);
700691

701-
/* Verify per_node_leaders array */
702-
EXPECT_EQ(0, per_node_leaders[0]); // Node 0 leader
703-
EXPECT_EQ(2, per_node_leaders[1]); // Node 1 leader
704-
705692
/* Test subset with node_leader_rank_id = 1 */
706693
ucc_topo_cleanup(topo);
707694
EXPECT_EQ(UCC_OK, ucc_topo_init(set, ctx_topo, &topo));
708695
topo->node_leader_rank_id = 1;
709-
EXPECT_EQ(UCC_OK, ucc_topo_get_node_leaders(topo, &node_leaders, &per_node_leaders));
696+
EXPECT_EQ(UCC_OK, ucc_topo_get_node_leaders(topo, &node_leaders));
710697

711698
/* Verify node leaders array for subset */
712699
// Ranks 0,1 (from node 0) should point to rank 1 (second rank on node 0)
@@ -715,8 +702,4 @@ UCC_TEST_F(test_topo, node_leaders)
715702
// Ranks 2,3 (from node 1) should point to rank 3 (second rank on node 1)
716703
EXPECT_EQ(3, node_leaders[2]);
717704
EXPECT_EQ(3, node_leaders[3]);
718-
719-
/* Verify per_node_leaders array */
720-
EXPECT_EQ(1, per_node_leaders[0]); // Node 0 leader
721-
EXPECT_EQ(3, per_node_leaders[1]); // Node 1 leader
722705
}

0 commit comments

Comments
 (0)