34
34
#include "coll_base_topo.h"
35
35
#include "coll_base_util.h"
36
36
37
+ /*
38
+ * ompi_coll_base_scatter_intra_binomial
39
+ *
40
+ * Function: Binomial tree algorithm for scatter
41
+ * Accepts: Same as MPI_Scatter
42
+ * Returns: MPI_SUCCESS or error code
43
+ *
44
+ * Time complexity: \alpha\log(p) + \beta*m((p-1)/p),
45
+ * where m = scount * comm_size, p = comm_size
46
+ *
47
+ * Memory requirements (per process):
48
+ * root process (root > 0): scount * comm_size * sdtype_size
49
+ * non-root, non-leaf process: rcount * comm_size * rdtype_size / 2
50
+ *
51
+ * Examples:
52
+ *    comm_size=8         comm_size=10          comm_size=12
53
+ *        0                      0                      0
54
+ *      / | \              /   / | \                /  / \ \
55
+ *     4  2  1            8   4  2  1            8    4   2  1
56
+ *   / |  |              /  / |  |             / |  / |   |
57
+ *  6  5  3             9  6  5  3           10  9 6  5   3
58
+ *  |                      |                  |    |
59
+ *  7                      7                 11    7
60
+ */
37
61
int
38
- ompi_coll_base_scatter_intra_binomial ( const void * sbuf , int scount ,
39
- struct ompi_datatype_t * sdtype ,
40
- void * rbuf , int rcount ,
41
- struct ompi_datatype_t * rdtype ,
42
- int root ,
43
- struct ompi_communicator_t * comm ,
44
- mca_coll_base_module_t * module )
62
+ ompi_coll_base_scatter_intra_binomial (
63
+ const void * sbuf , int scount , struct ompi_datatype_t * sdtype ,
64
+ void * rbuf , int rcount , struct ompi_datatype_t * rdtype ,
65
+ int root , struct ompi_communicator_t * comm ,
66
+ mca_coll_base_module_t * module )
45
67
{
46
- int line = -1 , i , rank , vrank , size , total_send = 0 , err ;
68
+ int line = -1 , rank , vrank , size , err ;
47
69
char * ptmp , * tempbuf = NULL ;
48
- ompi_coll_tree_t * bmtree ;
49
70
MPI_Status status ;
50
- mca_coll_base_module_t * base_module = (mca_coll_base_module_t * ) module ;
71
+ mca_coll_base_module_t * base_module = (mca_coll_base_module_t * )module ;
51
72
mca_coll_base_comm_t * data = base_module -> base_data ;
52
73
ptrdiff_t sextent , rextent , ssize , rsize , sgap = 0 , rgap = 0 ;
53
74
54
-
55
75
size = ompi_comm_size (comm );
56
76
rank = ompi_comm_rank (comm );
57
77
58
78
OPAL_OUTPUT ((ompi_coll_base_framework .framework_output ,
59
- "ompi_coll_base_scatter_intra_binomial rank %d" , rank ));
60
-
61
- /* create the binomial tree */
62
- COLL_BASE_UPDATE_IN_ORDER_BMTREE ( comm , base_module , root );
63
- bmtree = data -> cached_in_order_bmtree ;
79
+ "coll:base:scatter_intra_binomial rank %d/%d" , rank , size ));
64
80
65
- ompi_datatype_type_extent (rdtype , & rextent );
66
-
67
- rsize = opal_datatype_span (& rdtype -> super , (int64_t )rcount * size , & rgap );
81
+ /* Create the binomial tree */
82
+ COLL_BASE_UPDATE_IN_ORDER_BMTREE (comm , base_module , root );
83
+ if (NULL == data -> cached_in_order_bmtree ) {
84
+ err = OMPI_ERR_OUT_OF_RESOURCE ; line = __LINE__ ; goto err_hndl ;
85
+ }
86
+ ompi_coll_tree_t * bmtree = data -> cached_in_order_bmtree ;
68
87
69
88
vrank = (rank - root + size ) % size ;
70
- ptmp = (char * ) rbuf ; /* by default suppose leaf nodes, just use rbuf */
89
+ ptmp = (char * )rbuf ; /* by default suppose leaf nodes, just use rbuf */
71
90
72
91
if (rank == root ) {
73
92
ompi_datatype_type_extent (sdtype , & sextent );
74
93
ssize = opal_datatype_span (& sdtype -> super , (int64_t )scount * size , & sgap );
75
94
if (0 == root ) {
76
95
/* root on 0, just use the send buffer */
77
- ptmp = (char * ) sbuf ;
96
+ ptmp = (char * )sbuf ;
78
97
if (rbuf != MPI_IN_PLACE ) {
79
98
/* local copy to rbuf */
80
99
err = ompi_datatype_sndrcv (sbuf , scount , sdtype ,
@@ -83,7 +102,7 @@ ompi_coll_base_scatter_intra_binomial( const void *sbuf, int scount,
83
102
}
84
103
} else {
85
104
/* root is not on 0, allocate temp buffer for send */
86
- tempbuf = (char * ) malloc (ssize );
105
+ tempbuf = (char * )malloc (ssize );
87
106
if (NULL == tempbuf ) {
88
107
err = OMPI_ERR_OUT_OF_RESOURCE ; line = __LINE__ ; goto err_hndl ;
89
108
}
@@ -94,7 +113,6 @@ ompi_coll_base_scatter_intra_binomial( const void *sbuf, int scount,
94
113
ptmp , (char * ) sbuf + sextent * (ptrdiff_t )root * (ptrdiff_t )scount );
95
114
if (MPI_SUCCESS != err ) { line = __LINE__ ; goto err_hndl ; }
96
115
97
-
98
116
err = ompi_datatype_copy_content_same_ddt (sdtype , (ptrdiff_t )scount * (ptrdiff_t )root ,
99
117
ptmp + sextent * (ptrdiff_t )scount * (ptrdiff_t )(size - root ), (char * )sbuf );
100
118
if (MPI_SUCCESS != err ) { line = __LINE__ ; goto err_hndl ; }
@@ -106,53 +124,54 @@ ompi_coll_base_scatter_intra_binomial( const void *sbuf, int scount,
106
124
if (MPI_SUCCESS != err ) { line = __LINE__ ; goto err_hndl ; }
107
125
}
108
126
}
109
- total_send = scount ;
110
127
} else if (!(vrank % 2 )) {
111
- /* non-root, non-leaf nodes, allocte temp buffer for recv
128
+ /* non-root, non-leaf nodes, allocate temp buffer for recv
112
129
* the most we need is rcount*size/2 */
113
- tempbuf = (char * ) malloc (rsize );
130
+ ompi_datatype_type_extent (rdtype , & rextent );
131
+ rsize = opal_datatype_span (& rdtype -> super , (int64_t )rcount * size , & rgap );
132
+ tempbuf = (char * )malloc (rsize / 2 );
114
133
if (NULL == tempbuf ) {
115
- err = OMPI_ERR_OUT_OF_RESOURCE ; line = __LINE__ ; goto err_hndl ;
134
+ err = OMPI_ERR_OUT_OF_RESOURCE ; line = __LINE__ ; goto err_hndl ;
116
135
}
117
136
ptmp = tempbuf - rgap ;
118
-
119
137
sdtype = rdtype ;
120
138
scount = rcount ;
121
139
sextent = rextent ;
122
- total_send = scount ;
123
140
}
124
141
142
+ int curr_count = (rank == root ) ? scount * size : 0 ;
125
143
if (!(vrank % 2 )) {
126
144
if (rank != root ) {
127
145
/* recv from parent on non-root */
128
146
err = MCA_PML_CALL (recv (ptmp , (ptrdiff_t )rcount * (ptrdiff_t )size , rdtype , bmtree -> tree_prev ,
129
147
MCA_COLL_BASE_TAG_SCATTER , comm , & status ));
130
148
if (MPI_SUCCESS != err ) { line = __LINE__ ; goto err_hndl ; }
149
+
150
+ /* Get received count */
151
+ size_t rdtype_size ;
152
+ ompi_datatype_type_size (rdtype , & rdtype_size );
153
+ curr_count = (int )(status ._ucount / rdtype_size );
154
+
131
155
/* local copy to rbuf */
132
156
err = ompi_datatype_sndrcv (ptmp , scount , sdtype ,
133
157
rbuf , rcount , rdtype );
134
158
if (MPI_SUCCESS != err ) { line = __LINE__ ; goto err_hndl ; }
135
159
}
136
160
/* send to children on all non-leaf */
137
- for (i = 0 ; i < bmtree -> tree_nextsize ; i ++ ) {
138
- size_t mycount = 0 ;
139
- int vkid ;
161
+ for (int i = bmtree -> tree_nextsize - 1 ; i >= 0 ; i -- ) {
140
162
/* figure out how much data I have to send to this child */
141
- vkid = (bmtree -> tree_next [i ] - root + size ) % size ;
142
- mycount = vkid - vrank ;
143
- if ( (int )mycount > (size - vkid ) )
144
- mycount = size - vkid ;
145
- mycount *= scount ;
146
-
147
- err = MCA_PML_CALL (send (ptmp + (ptrdiff_t )total_send * sextent , mycount , sdtype ,
148
- bmtree -> tree_next [i ],
163
+ int vchild = (bmtree -> tree_next [i ] - root + size ) % size ;
164
+ int send_count = vchild - vrank ;
165
+ if (send_count > size - vchild )
166
+ send_count = size - vchild ;
167
+ send_count *= scount ;
168
+ err = MCA_PML_CALL (send (ptmp + (ptrdiff_t )(curr_count - send_count ) * sextent ,
169
+ send_count , sdtype , bmtree -> tree_next [i ],
149
170
MCA_COLL_BASE_TAG_SCATTER ,
150
171
MCA_PML_BASE_SEND_STANDARD , comm ));
151
172
if (MPI_SUCCESS != err ) { line = __LINE__ ; goto err_hndl ; }
152
-
153
- total_send += mycount ;
173
+ curr_count -= send_count ;
154
174
}
155
-
156
175
if (NULL != tempbuf )
157
176
free (tempbuf );
158
177
} else {
0 commit comments