Commit 91efa83

Merge pull request #8383 from bosilca/fix/scatter
A new binomial scatter using packed data on intermediary processes.
2 parents: e604107 + 21e4d87
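
The core of the new scheme, condensed from the root branch of the second diff hunk below (declarations, error handling, and casts trimmed): the root packs its entire send buffer once with an opal_convertor, rotated so that its own chunk lands at offset 0, and every later count in the algorithm is a byte count on MPI_PACKED data.

    OBJ_CONSTRUCT(&convertor, opal_convertor_t);
    opal_convertor_copy_and_prepare_for_send(ompi_mpi_local_convertor, &(sdtype->super),
                                             scount * size, sbuf, 0, &convertor);
    opal_convertor_get_packed_size(&convertor, &packed_sizet);  /* total bytes */
    ptmp = tempbuf = malloc(packed_sizet);

    /* the first pack call consumes ranks 0..root-1; place them after the rest */
    iov[0].iov_base = ptmp + (size - root) * (packed_sizet / size);
    iov[0].iov_len  = max_data = (packed_sizet / size) * root;
    opal_convertor_pack(&convertor, iov, &iov_size, &max_data);

    /* the second pack call consumes ranks root..size-1; place them first */
    iov[0].iov_base = ptmp;
    iov[0].iov_len  = max_data = (packed_sizet / size) * (size - root);
    opal_convertor_pack(&convertor, iov, &iov_size, &max_data);
    OBJ_DESTRUCT(&convertor);

Intermediary (non-leaf, non-root) processes then receive and forward raw bytes; only the final copy into each rank's rbuf goes back through the datatype engine (ompi_datatype_sndrcv).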

ompi/mca/coll/base/coll_base_scatter.c

Lines changed: 82 additions & 84 deletions
@@ -66,12 +66,14 @@ ompi_coll_base_scatter_intra_binomial(
     int root, struct ompi_communicator_t *comm,
     mca_coll_base_module_t *module)
 {
-    int line = -1, rank, vrank, size, err;
-    char *ptmp, *tempbuf = NULL;
-    MPI_Status status;
     mca_coll_base_module_t *base_module = (mca_coll_base_module_t*)module;
     mca_coll_base_comm_t *data = base_module->base_data;
-    ptrdiff_t sextent, rextent, ssize, rsize, sgap = 0, rgap = 0;
+    int line = -1, rank, vrank, size, err, packed_size, curr_count;
+    char *ptmp, *tempbuf = NULL;
+    size_t max_data, packed_sizet;
+    opal_convertor_t convertor;
+    ptrdiff_t sextent;
+    MPI_Status status;
 
     size = ompi_comm_size(comm);
     rank = ompi_comm_rank(comm);
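
Before the second hunk, one note on the rotation it performs when root != 0: the packed buffer must start with the chunk for virtual rank 0, i.e. the root itself, so ranks root..size-1 land first and ranks 0..root-1 follow. A minimal standalone sketch of the same index arithmetic using plain memcpy, assuming contiguous per-rank chunks of `chunk` bytes (the helper and its name are hypothetical, for illustration only):

    #include <string.h>

    /* Hypothetical helper: rotate `size` contiguous chunks of `chunk` bytes
     * so the chunk owned by rank `root` comes first, mirroring the two
     * opal_convertor_pack() calls in the hunk below. */
    static void rotate_for_root(char *dst, const char *src,
                                int size, int root, size_t chunk)
    {
        /* ranks root..size-1 move to the front */
        memcpy(dst, src + (size_t)root * chunk, (size_t)(size - root) * chunk);
        /* ranks 0..root-1 follow */
        memcpy(dst + (size_t)(size - root) * chunk, src, (size_t)root * chunk);
    }

The patch gets the same layout from two opal_convertor_pack() calls over rotated iovecs, which additionally handles non-contiguous send datatypes.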
@@ -89,99 +91,95 @@ ompi_coll_base_scatter_intra_binomial(
     vrank = (rank - root + size) % size;
     ptmp = (char *)rbuf;  /* by default suppose leaf nodes, just use rbuf */
 
-    if (rank == root) {
+    if ( vrank % 2 ) {  /* leaves */
+        /* recv from parent on leaf nodes */
+        err = MCA_PML_CALL(recv(rbuf, rcount, rdtype, bmtree->tree_prev,
+                                MCA_COLL_BASE_TAG_SCATTER, comm, &status));
+        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+        return MPI_SUCCESS;
+
+    }
+    OBJ_CONSTRUCT( &convertor, opal_convertor_t );
+    if (rank == root) {  /* root and non-leafs */
         ompi_datatype_type_extent(sdtype, &sextent);
-        ssize = opal_datatype_span(&sdtype->super, (int64_t)scount * size, &sgap);
-        if (0 == root) {
-            /* root on 0, just use the send buffer */
-            ptmp = (char *)sbuf;
-            if (rbuf != MPI_IN_PLACE) {
-                /* local copy to rbuf */
-                err = ompi_datatype_sndrcv(sbuf, scount, sdtype,
-                                           rbuf, rcount, rdtype);
-                if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
-            }
-        } else {
-            /* root is not on 0, allocate temp buffer for send */
-            tempbuf = (char *)malloc(ssize);
+        ptmp = (char *)sbuf;  /* if root == 0, just use the send buffer */
+        if (0 != root) {
+            opal_convertor_copy_and_prepare_for_send( ompi_mpi_local_convertor, &(sdtype->super),
+                                                      scount * size, sbuf, 0, &convertor );
+            opal_convertor_get_packed_size( &convertor, &packed_sizet );
+            packed_size = (int)packed_sizet;
+            packed_sizet = packed_sizet / size;
+            ptmp = tempbuf = (char *)malloc(packed_size);
             if (NULL == tempbuf) {
                 err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl;
             }
-            ptmp = tempbuf - sgap;
-
-            /* and rotate data so they will eventually in the right place */
-            err = ompi_datatype_copy_content_same_ddt(sdtype, (ptrdiff_t)scount * (ptrdiff_t)(size - root),
-                                                      ptmp, (char *) sbuf + sextent * (ptrdiff_t)root * (ptrdiff_t)scount);
-            if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
-
-            err = ompi_datatype_copy_content_same_ddt(sdtype, (ptrdiff_t)scount * (ptrdiff_t)root,
-                                                      ptmp + sextent * (ptrdiff_t)scount * (ptrdiff_t)(size - root), (char *)sbuf);
-            if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
-
-            if (rbuf != MPI_IN_PLACE) {
-                /* local copy to rbuf */
-                err = ompi_datatype_sndrcv(ptmp, scount, sdtype,
-                                           rbuf, rcount, rdtype);
-                if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
-            }
+            /* rotate data so they will eventually be in the right place */
+            struct iovec iov[1];
+            uint32_t iov_size = 1;
+
+            iov[0].iov_base = ptmp + (ptrdiff_t)(size - root) * packed_sizet;
+            iov[0].iov_len = max_data = packed_sizet * (ptrdiff_t)root;
+            opal_convertor_pack(&convertor, iov, &iov_size, &max_data);
+
+            iov[0].iov_base = ptmp;
+            iov[0].iov_len = max_data = packed_sizet * (ptrdiff_t)(size - root);
+            opal_convertor_pack(&convertor, iov, &iov_size, &max_data);
+            OBJ_DESTRUCT(&convertor);
+
+            sdtype = MPI_PACKED;
+            sextent = 1;  /* bytes */
+            scount = packed_size / size;
         }
-    } else if (!(vrank % 2)) {
-        /* non-root, non-leaf nodes, allocate temp buffer for recv
-         * the most we need is rcount*size/2 */
-        ompi_datatype_type_extent(rdtype, &rextent);
-        rsize = opal_datatype_span(&rdtype->super, (int64_t)rcount * size, &rgap);
-        tempbuf = (char *)malloc(rsize / 2);
+        curr_count = scount * size;
+    } else {  /* (!(vrank % 2)) */
+        opal_convertor_copy_and_prepare_for_send( ompi_mpi_local_convertor, &(rdtype->super),
+                                                  rcount, NULL, 0, &convertor );
+        opal_convertor_get_packed_size( &convertor, &packed_sizet );
+        scount = (int)packed_sizet;
+
+        sdtype = MPI_PACKED;  /* default to MPI_PACKED as the send type */
+        packed_size = scount * (size+1)/2;  /* non-root, non-leaf nodes, allocate temp buffer for recv
+                                             * the most we need is rcount*size/2 */
+        ptmp = tempbuf = (char *)malloc(packed_size);
         if (NULL == tempbuf) {
            err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl;
        }
-        ptmp = tempbuf - rgap;
-        sdtype = rdtype;
-        scount = rcount;
-        sextent = rextent;
-    }
 
-    int curr_count = (rank == root) ? scount * size : 0;
-    if (!(vrank % 2)) {
-        if (rank != root) {
-            /* recv from parent on non-root */
-            err = MCA_PML_CALL(recv(ptmp, (ptrdiff_t)rcount * (ptrdiff_t)size, rdtype, bmtree->tree_prev,
-                                    MCA_COLL_BASE_TAG_SCATTER, comm, &status));
-            if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
-
-            /* Get received count */
-            size_t rdtype_size;
-            ompi_datatype_type_size(rdtype, &rdtype_size);
-            curr_count = (int)(status._ucount / rdtype_size);
-
-            /* local copy to rbuf */
-            err = ompi_datatype_sndrcv(ptmp, scount, sdtype,
-                                       rbuf, rcount, rdtype);
-            if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
-        }
-        /* send to children on all non-leaf */
-        for (int i = bmtree->tree_nextsize - 1; i >= 0; i--) {
-            /* figure out how much data I have to send to this child */
-            int vchild = (bmtree->tree_next[i] - root + size) % size;
-            int send_count = vchild - vrank;
-            if (send_count > size - vchild)
-                send_count = size - vchild;
-            send_count *= scount;
-            err = MCA_PML_CALL(send(ptmp + (ptrdiff_t)(curr_count - send_count) * sextent,
-                                    send_count, sdtype, bmtree->tree_next[i],
-                                    MCA_COLL_BASE_TAG_SCATTER,
-                                    MCA_PML_BASE_SEND_STANDARD, comm));
-            if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
-            curr_count -= send_count;
-        }
-        if (NULL != tempbuf)
-            free(tempbuf);
-    } else {
-        /* recv from parent on leaf nodes */
-        err = MCA_PML_CALL(recv(ptmp, rcount, rdtype, bmtree->tree_prev,
+        /* recv from parent on non-root */
+        err = MCA_PML_CALL(recv(ptmp, (ptrdiff_t)packed_size, MPI_PACKED, bmtree->tree_prev,
                                 MCA_COLL_BASE_TAG_SCATTER, comm, &status));
         if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+
+        /* Get received count */
+        curr_count = (int)status._ucount;  /* no need for conversion, work in bytes */
+        sextent = 1;  /* bytes */
     }
 
+    if (rbuf != MPI_IN_PLACE) {  /* local copy to rbuf */
+        err = ompi_datatype_sndrcv(ptmp, scount, sdtype,
+                                   rbuf, rcount, rdtype);
+        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+    }
+
+    /* send to children on all non-leaf */
+    for (int i = bmtree->tree_nextsize - 1; i >= 0; i--) {
+        /* figure out how much data I have to send to this child */
+        int vchild = (bmtree->tree_next[i] - root + size) % size;
+        int send_count = vchild - vrank;
+        if (send_count > size - vchild)
+            send_count = size - vchild;
+        send_count *= scount;
+
+        err = MCA_PML_CALL(send(ptmp + (ptrdiff_t)(curr_count - send_count) * sextent,
+                                send_count, sdtype, bmtree->tree_next[i],
+                                MCA_COLL_BASE_TAG_SCATTER,
+                                MCA_PML_BASE_SEND_STANDARD, comm));
+        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+        curr_count -= send_count;
+    }
+    if (NULL != tempbuf)
+        free(tempbuf);
+
     return MPI_SUCCESS;
 
 err_hndl:
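
For reference, the send-size logic the loop above keeps from the old implementation: a child at virtual rank vchild owns the block range vchild..vchild + min(vchild - vrank, size - vchild) - 1, and the parent always peels that range off the tail of the data it still holds. A small standalone sketch with assumed values (size = 8, root = 0, so vrank 0's children sit at virtual ranks 4, 2, 1; all values hypothetical):

    #include <stdio.h>

    int main(void)
    {
        int size = 8, vrank = 0, scount = 4;  /* assumed example values */
        int children[] = {4, 2, 1};           /* visited high-to-low, as in the loop above */
        int curr_count = scount * size;       /* the root starts with everything */

        for (int i = 0; i < 3; i++) {
            int vchild = children[i];
            int send_count = vchild - vrank;  /* subtree span below this child */
            if (send_count > size - vchild)   /* clip at the communicator size */
                send_count = size - vchild;
            send_count *= scount;
            printf("to vchild %d: %d elements from offset %d\n",
                   vchild, send_count, curr_count - send_count);
            curr_count -= send_count;
        }
        return 0;
    }

This prints 16 elements from offset 16, then 8 from offset 8, then 4 from offset 4, leaving the root's own scount = 4 elements at the front of the buffer.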
