
Commit b0b831f

sb17v and artpol84 committed
oshmem: Reduce the memory footprint for communicator creation
oshmem uses the ompi_proc structure during group creation, but only the vpids and locality information are actually used inside oshmem. Remove the proc_array from oshmem_group and replace it with an array holding the vpids; a static bitmap, populated from PMIx, keeps track of locality.

Example: each ompi_proc_t entry in proc_array was about 112 bytes, while an entry in the vpid array is 4 bytes and the locality bitmap costs roughly 1/8 byte per proc. This reduces the memory usage from 112 to ~4.13 bytes per proc on each PE. With 100 nodes and 40 PPN, each proc used to consume 112 * 40 = 4480 bytes in this code path; after this change it comes down to 4.13 * 40 ≈ 165 bytes, so the total per-node usage (with 40 PPN) drops from 179,200 bytes to about 6,600 bytes.

Co-authored-by: Artem Y. Polyakov <artemp@nvidia.com>
Signed-off-by: Subhadeep Bhattacharya <subhadeepb@nvidia.com>
Signed-off-by: Artem Polyakov <artemp@nvidia.com>
1 parent 4ddc66e commit b0b831f

17 files changed: +127 −150 lines changed
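For orientation before the diffs: the change replaces the per-group array of ompi_proc_t pointers with a plain vpid array plus a job-wide locality bitmap, reached through oshmem_proc_pe_vpid() and oshmem_proc_on_local_node(). The standalone C sketch below only illustrates that layout and the footprint arithmetic from the commit message; every identifier in it except those two accessor names (which appear in the diffs) is hypothetical, and the real definitions live in the oshmem proc/group headers that are not part of this excerpt.

/* Minimal sketch of the new layout (illustrative names, not the real headers). */
#include <stdint.h>
#include <stdio.h>

typedef struct sketch_group {
    int       proc_count;   /* number of PEs in the group                     */
    int       my_pe;        /* calling PE's global id                         */
    uint32_t *proc_vpids;   /* global vpid per group index: 4 bytes per entry */
} sketch_group_t;

/* Shape of oshmem_proc_pe_vpid(group, idx): group-local index -> global PE id. */
static inline int sketch_proc_pe_vpid(const sketch_group_t *g, int idx)
{
    return (int)g->proc_vpids[idx];
}

int main(void)
{
    uint32_t vpids[4] = { 0, 1, 2, 3 };
    sketch_group_t grp = { 4, 0, vpids };

    for (int i = 0; i < grp.proc_count; i++) {
        printf("group idx %d -> PE %d\n", i, sketch_proc_pe_vpid(&grp, i));
    }

    /* Footprint arithmetic from the commit message: ~112 bytes per
     * ompi_proc_t entry vs. 4 bytes (vpid) + 1/8 byte (one locality bit),
     * which the commit rounds to ~4.13 bytes per proc. */
    double per_proc = sizeof(uint32_t) + 1.0 / 8.0;
    printf("per proc: 112 B -> %.3f B; 40 PPN: %d B -> %.0f B\n",
           per_proc, 112 * 40, per_proc * 40);
    return 0;
}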

oshmem/mca/memheap/base/memheap_base_mkey.c

Lines changed: 1 addition & 4 deletions
@@ -151,9 +151,6 @@ static void unpack_remote_mkeys(shmem_ctx_t ctx, pmix_data_buffer_t *msg, int re
     int32_t n;
     int32_t tr_id;
     int i;
-    ompi_proc_t *proc;
-
-    proc = oshmem_proc_group_find(oshmem_group_all, remote_pe);
     cnt = 1;
     PMIx_Data_unpack(NULL, msg, &n, &cnt, PMIX_UINT32);
     for (i = 0; i < n; i++) {
@@ -168,7 +165,7 @@ static void unpack_remote_mkeys(shmem_ctx_t ctx, pmix_data_buffer_t *msg, int re
         if (0 == memheap_oob.mkeys[tr_id].va_base) {
             cnt = 1;
             PMIx_Data_unpack(NULL, msg, &memheap_oob.mkeys[tr_id].u.key, &cnt, PMIX_UINT64);
-            if (OPAL_PROC_ON_LOCAL_NODE(proc->super.proc_flags)) {
+            if (oshmem_proc_on_local_node(remote_pe)) {
                 memheap_attach_segment(&memheap_oob.mkeys[tr_id], tr_id);
             }
         } else {
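The deleted lines above show that this function resolved a full ompi_proc_t only to test the peer's node locality; the new call takes the PE number directly. Below is a self-contained sketch of one way such a check could work against a bitmap built at startup from a comma-separated local-ranks list (the kind of list a PMIx local-peers query returns). Apart from the oshmem_proc_on_local_node() name used in the diff, every identifier here is an assumption, not the actual implementation.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* One bit per PE in the job: roughly nprocs/8 bytes total. */
static uint8_t *locality_bits;

static void set_local_bit(int pe)
{
    locality_bits[pe / 8] |= (uint8_t)(1u << (pe % 8));
}

/* Shape of oshmem_proc_on_local_node(pe): a single bit test. */
static int on_local_node(int pe)
{
    return (locality_bits[pe / 8] >> (pe % 8)) & 1;
}

/* Fill the bitmap from a comma-separated list of ranks on this node
 * (assumed format of the local-peers information collected from PMIx). */
static void fill_from_local_peers(const char *peers, int nprocs)
{
    locality_bits = calloc((nprocs + 7) / 8, 1);
    char *copy = strdup(peers);
    for (char *tok = strtok(copy, ","); tok != NULL; tok = strtok(NULL, ","))
        set_local_bit(atoi(tok));
    free(copy);
}

int main(void)
{
    fill_from_local_peers("0,1,4,5", 8);   /* PEs 0, 1, 4, 5 share our node */
    for (int pe = 0; pe < 8; pe++)
        printf("pe %d local=%d\n", pe, on_local_node(pe));
    free(locality_bits);
    return 0;
}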

oshmem/mca/scoll/basic/scoll_basic_alltoall.c

Lines changed: 1 addition & 1 deletion
@@ -116,7 +116,7 @@ get_dst_pe(struct oshmem_group_t *group, int src_blk_idx, int dst_blk_idx, int *
     (*dst_pe_idx) = (dst_blk_idx + src_blk_idx) % group->proc_count;

     /* convert to the global pe */
-    return oshmem_proc_pe(group->proc_array[*dst_pe_idx]);
+    return oshmem_proc_pe_vpid(group, *dst_pe_idx);
 }

 static int a2as_alg_simple(struct oshmem_group_t *group,

oshmem/mca/scoll/basic/scoll_basic_barrier.c

Lines changed: 14 additions & 16 deletions
@@ -103,7 +103,7 @@ static int _algorithm_central_counter(struct oshmem_group_t *group,
     int rc = OSHMEM_SUCCESS;
     long value = SHMEM_SYNC_INIT;
     int root_id = 0;
-    int PE_root = oshmem_proc_pe(group->proc_array[root_id]);
+    int PE_root = oshmem_proc_pe_vpid(group, root_id);
     int i = 0;

     SCOLL_VERBOSE(12, "[#%d] Barrier algorithm: Central Counter", group->my_pe);
@@ -124,7 +124,7 @@ static int _algorithm_central_counter(struct oshmem_group_t *group,

         wait_pe_count = group->proc_count;
         for (i = 0; i < group->proc_count; i++) {
-            wait_pe_array[i] = oshmem_proc_pe(group->proc_array[i]);
+            wait_pe_array[i] = oshmem_proc_pe_vpid(group, i);
         }
         wait_pe_array[root_id] = OSHMEM_PE_INVALID;
         wait_pe_count--;
@@ -151,7 +151,7 @@ static int _algorithm_central_counter(struct oshmem_group_t *group,
         value = SHMEM_SYNC_RUN;
         for (i = 0; (i < group->proc_count) && (rc == OSHMEM_SUCCESS);
                 i++) {
-            pe_cur = oshmem_proc_pe(group->proc_array[i]);
+            pe_cur = oshmem_proc_pe_vpid(group, i);
             if (pe_cur != PE_root) {
                 rc = MCA_SPML_CALL(put(oshmem_ctx_default, (void*)pSync, sizeof(value), (void*)&value, pe_cur));
             }
@@ -238,7 +238,7 @@ static int _algorithm_tournament(struct oshmem_group_t *group, long *pSync)
             SCOLL_VERBOSE(14, "[#%d] round = %d wait", group->my_pe, round);
             rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG));
         } else {
-            peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);
+            peer_pe = oshmem_proc_pe_vpid(group, peer_id);

 #if 1 /* It is ugly implementation of compare and swap operation
          Usage of this hack does not give performance improvement but
@@ -284,7 +284,7 @@ static int _algorithm_tournament(struct oshmem_group_t *group, long *pSync)
         for (peer_id = 1;
              (peer_id < group->proc_count) && (rc == OSHMEM_SUCCESS);
              peer_id++) {
-            peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);
+            peer_pe = oshmem_proc_pe_vpid(group, peer_id);
             rc = MCA_SPML_CALL(put(oshmem_ctx_default, (void*)pSync, sizeof(value), (void*)&value, peer_pe));
         }
     }
@@ -333,7 +333,7 @@ static int _algorithm_recursive_doubling(struct oshmem_group_t *group,
     if (my_id >= floor2_proc) {
         /* I am in extra group, my partner is node (my_id-y) in basic group */
         peer_id = my_id - floor2_proc;
-        peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);
+        peer_pe = oshmem_proc_pe_vpid(group, peer_id);

         SCOLL_VERBOSE(14,
                       "[#%d] is extra and signal to #%d",
@@ -357,7 +357,7 @@ static int _algorithm_recursive_doubling(struct oshmem_group_t *group,
         if ((group->proc_count - floor2_proc) > my_id) {
             /* I am in basic group, my partner is node (my_id+y) in extra group */
             peer_id = my_id + floor2_proc;
-            peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);
+            peer_pe = oshmem_proc_pe_vpid(group, peer_id);

             SCOLL_VERBOSE(14,
                           "[#%d] wait a signal from #%d",
@@ -376,8 +376,7 @@ static int _algorithm_recursive_doubling(struct oshmem_group_t *group,
             /* Update exit condition and round counter */
             exit_flag >>= 1;
             round++;
-
-            peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);
+            peer_pe = oshmem_proc_pe_vpid(group, peer_id);

 #if 1 /* It is ugly implementation of compare and swap operation
          Usage of this hack does not give performance improvement but
@@ -420,7 +419,7 @@ static int _algorithm_recursive_doubling(struct oshmem_group_t *group,
         if ((group->proc_count - floor2_proc) > my_id) {
             /* I am in basic group, my partner is node (my_id+y) in extra group */
             peer_id = my_id + floor2_proc;
-            peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);
+            peer_pe = oshmem_proc_pe_vpid(group, peer_id);

             SCOLL_VERBOSE(14, "[#%d] signals to #%d", group->my_pe, peer_pe);
             value = SHMEM_SYNC_RUN;
@@ -462,8 +461,7 @@ static int _algorithm_dissemination(struct oshmem_group_t *group, long *pSync)
     for (round = 0; (round <= log2_proc) && (rc == OSHMEM_SUCCESS); round++) {
         /* Define a peer to send signal */
         peer_id = (my_id + (1 << round)) % group->proc_count;
-
-        peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);
+        peer_pe = oshmem_proc_pe_vpid(group, peer_id);

 #if 1 /* It is ugly implementation of compare and swap operation
          Usage of this hack does not give performance improvement but
@@ -502,7 +500,7 @@ static int _algorithm_basic(struct oshmem_group_t *group, long *pSync)
 {
     int rc = OSHMEM_SUCCESS;
     int root_id = 0;
-    int PE_root = oshmem_proc_pe(group->proc_array[root_id]);
+    int PE_root = oshmem_proc_pe_vpid(group, root_id);
     int i = 0;

     SCOLL_VERBOSE(12, "[#%d] Barrier algorithm: Basic", group->my_pe);
@@ -525,7 +523,7 @@ static int _algorithm_basic(struct oshmem_group_t *group, long *pSync)
         int pe_cur = 0;

         for (i = 0; (i < group->proc_count) && (rc == OSHMEM_SUCCESS); i++) {
-            pe_cur = oshmem_proc_pe(group->proc_array[i]);
+            pe_cur = oshmem_proc_pe_vpid(group, i);
             if (pe_cur != PE_root) {
                 rc = MCA_SPML_CALL(recv(NULL, 0, pe_cur));
             }
@@ -535,7 +533,7 @@ static int _algorithm_basic(struct oshmem_group_t *group, long *pSync)
         }

         for (i = 0; (i < group->proc_count) && (rc == OSHMEM_SUCCESS); i++) {
-            pe_cur = oshmem_proc_pe(group->proc_array[i]);
+            pe_cur = oshmem_proc_pe_vpid(group, i);
             if (pe_cur != PE_root) {
                 rc = MCA_SPML_CALL(send(NULL, 0, pe_cur, MCA_SPML_BASE_PUT_STANDARD));
             }
@@ -564,7 +562,7 @@ static int _algorithm_adaptive(struct oshmem_group_t *group, long *pSync)
         if (i == my_id)
             continue;

-        if (!OPAL_PROC_ON_LOCAL_NODE(group->proc_array[i]->super.proc_flags)) {
+        if (!oshmem_proc_on_local_node(i)) {
             local_peers_only = false;
             break;
         }

oshmem/mca/scoll/basic/scoll_basic_broadcast.c

Lines changed: 2 additions & 2 deletions
@@ -144,7 +144,7 @@ static int _algorithm_central_counter(struct oshmem_group_t *group,
                       "[#%d] send data to all PE in the group",
                       group->my_pe);
         for (i = 0; (i < group->proc_count) && (rc == OSHMEM_SUCCESS); i++) {
-            pe_cur = oshmem_proc_pe(group->proc_array[i]);
+            pe_cur = oshmem_proc_pe_vpid(group, i);
             if (pe_cur != PE_root) {
                 SCOLL_VERBOSE(15,
                               "[#%d] send data to #%d",
@@ -233,7 +233,7 @@ static int _algorithm_binomial_tree(struct oshmem_group_t *group,
         if (peer_id < group->proc_count) {
             /* Wait for the child to be ready to receive (pSync must have the initial value) */
             peer_id = (peer_id + root_id) % group->proc_count;
-            peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);
+            peer_pe = oshmem_proc_pe_vpid(group, peer_id);

             SCOLL_VERBOSE(14,
                           "[#%d] check remote pe is ready to receive #%d",

oshmem/mca/scoll/basic/scoll_basic_collect.c

Lines changed: 14 additions & 16 deletions
@@ -155,7 +155,7 @@ static int _algorithm_f_central_counter(struct oshmem_group_t *group,
 {
     int rc = OSHMEM_SUCCESS;
     int i = 0;
-    int PE_root = oshmem_proc_pe(group->proc_array[0]);
+    int PE_root = oshmem_proc_pe_vpid(group, 0);

     SCOLL_VERBOSE(12,
                   "[#%d] Collect algorithm: Central Counter (identical size)",
@@ -174,7 +174,7 @@ static int _algorithm_f_central_counter(struct oshmem_group_t *group,
                       group->my_pe);
         for (i = 0; (i < group->proc_count) && (rc == OSHMEM_SUCCESS); i++) {
             /* Get PE ID of a peer from the group */
-            pe_cur = oshmem_proc_pe(group->proc_array[i]);
+            pe_cur = oshmem_proc_pe_vpid(group, i);

             if (pe_cur == group->my_pe)
                 continue;
@@ -221,7 +221,7 @@ static int _algorithm_f_tournament(struct oshmem_group_t *group,
     int my_id = oshmem_proc_group_find_id(group, group->my_pe);
     int peer_id = 0;
     int peer_pe = 0;
-    int PE_root = oshmem_proc_pe(group->proc_array[0]);
+    int PE_root = oshmem_proc_pe_vpid(group, 0);

     SCOLL_VERBOSE(12,
                   "[#%d] Collect algorithm: Tournament (identical size)",
@@ -255,7 +255,7 @@ static int _algorithm_f_tournament(struct oshmem_group_t *group,
             SCOLL_VERBOSE(14, "[#%d] round = %d wait", group->my_pe, round);
             rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_EQ, (void*)&value, SHMEM_LONG));
         } else {
-            peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);
+            peer_pe = oshmem_proc_pe_vpid(group, peer_id);

 #if 1 /* It is ugly implementation of compare and swap operation
          Usage of this hack does not give performance improvement but
@@ -294,7 +294,7 @@ static int _algorithm_f_tournament(struct oshmem_group_t *group,
         for (peer_id = 1;
              (peer_id < group->proc_count) && (rc == OSHMEM_SUCCESS);
              peer_id++) {
-            peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);
+            peer_pe = oshmem_proc_pe_vpid(group, peer_id);
             rc = MCA_SPML_CALL(put(oshmem_ctx_default, (void*)pSync, sizeof(value), (void*)&value, peer_pe));
         }
     }
@@ -339,7 +339,7 @@ static int _algorithm_f_ring(struct oshmem_group_t *group,
     SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld", group->my_pe, pSync[0]);

     peer_id = (my_id + 1) % group->proc_count;
-    peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);
+    peer_pe = oshmem_proc_pe_vpid(group, peer_id);
     memcpy((void*) ((unsigned char*) target + my_id * nlong),
            (void *) source,
            nlong);
@@ -420,13 +420,12 @@ static int _algorithm_f_recursive_doubling(struct oshmem_group_t *group,

         /* I am in extra group, my partner is node (my_id-y) in basic group */
         peer_id = my_id - floor2_proc;
-        peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);
+        peer_pe = oshmem_proc_pe_vpid(group, peer_id);

         for (i = 0; (i < group->proc_count) && (rc == OSHMEM_SUCCESS); i++) {
             if (i == my_id)
                 continue;
-
-            pe_cur = oshmem_proc_pe(group->proc_array[i]);
+            pe_cur = oshmem_proc_pe_vpid(group, i);

             SCOLL_VERBOSE(14,
                           "[#%d] is extra send data to #%d",
@@ -450,7 +449,7 @@ static int _algorithm_f_recursive_doubling(struct oshmem_group_t *group,
         if ((group->proc_count - floor2_proc) > my_id) {
             /* I am in basic group, my partner is node (my_id+y) in extra group */
             peer_id = my_id + floor2_proc;
-            peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);
+            peer_pe = oshmem_proc_pe_vpid(group, peer_id);

             SCOLL_VERBOSE(14,
                           "[#%d] wait a signal from #%d",
@@ -469,8 +468,7 @@ static int _algorithm_f_recursive_doubling(struct oshmem_group_t *group,
             /* Update exit condition and round counter */
             exit_flag >>= 1;
             round++;
-
-            peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);
+            peer_pe = oshmem_proc_pe_vpid(group, peer_id);

 #if 1 /* It is ugly implementation of compare and swap operation
          Usage of this hack does not give performance improvement but
@@ -507,7 +505,7 @@ static int _algorithm_f_recursive_doubling(struct oshmem_group_t *group,
         if ((group->proc_count - floor2_proc) > my_id) {
             /* I am in basic group, my partner is node (my_id+y) in extra group */
             peer_id = my_id + floor2_proc;
-            peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);
+            peer_pe = oshmem_proc_pe_vpid(group, peer_id);

             SCOLL_VERBOSE(14,
                           "[#%d] is extra send data to #%d",
@@ -542,7 +540,7 @@ static int _algorithm_central_collector(struct oshmem_group_t *group,
     int rc = OSHMEM_SUCCESS;
     size_t offset = 0;
     int i = 0;
-    int PE_root = oshmem_proc_pe(group->proc_array[0]);
+    int PE_root = oshmem_proc_pe_vpid(group, 0);

     SCOLL_VERBOSE(12,
                   "[#%d] Collect algorithm: Central Counter (vary size)",
@@ -573,7 +571,7 @@ static int _algorithm_central_collector(struct oshmem_group_t *group,
             for (i = 1; (i < group->proc_count) && (rc == OSHMEM_SUCCESS);
                     i++) {
                 if (wait_pe_array[i] == 0) {
-                    pe_cur = oshmem_proc_pe(group->proc_array[i]);
+                    pe_cur = oshmem_proc_pe_vpid(group, i);
                     value = 0;
                     rc = MCA_SPML_CALL(get(oshmem_ctx_default, (void*)pSync, sizeof(value), (void*)&value, pe_cur));
                     if ((rc == OSHMEM_SUCCESS)
@@ -602,7 +600,7 @@ static int _algorithm_central_collector(struct oshmem_group_t *group,
             }

             /* Get PE ID of a peer from the group */
-            pe_cur = oshmem_proc_pe(group->proc_array[i]);
+            pe_cur = oshmem_proc_pe_vpid(group, i);

             /* Get data from the current peer */
             rc = MCA_SPML_CALL(get(oshmem_ctx_default, (void *)source, (size_t)wait_pe_array[i], (void*)((unsigned char*)target + offset), pe_cur));
