@@ -1142,3 +1142,169 @@ int ompi_coll_base_reduce_intra_redscat_gather(
1142
1142
free (scount );
1143
1143
return err ;
1144
1144
}
1145
+
1146
+ /*
1147
+ * ompi_coll_base_reduce_intra_knomial
1148
+ *
1149
+ * Function: reduce using k-nomial tree algorithm
1150
+ * Accepts: Same arguments as MPI_Reduce, plus radix
1151
+ * Returns: MPI_SUCCESS or error code
1152
+ * Parameters: radix -- k-nomial tree radix (>= 2)
1153
+ *
1154
+ * Time complexity: (radix - 1)O(\log_{radix}(comm_size))
1155
+ *
1156
+ * Example, comm_size=10
1157
+ * radix=2 radix=3 radix=4
1158
+ * 0 0 0
1159
+ * / / \ \ / / | \ \ / / \ \ \
1160
+ * 8 4 2 1 9 3 6 1 2 4 8 1 2 3
1161
+ * | |\ | |\ |\ /|\ |
1162
+ * 9 6 5 3 4 5 7 8 5 6 7 9
1163
+ * |
1164
+ * 7
1165
+ */
1166
+ int ompi_coll_base_reduce_intra_knomial ( const void * sendbuf , void * recvbuf ,
1167
+ int count , ompi_datatype_t * datatype ,
1168
+ ompi_op_t * op , int root ,
1169
+ ompi_communicator_t * comm ,
1170
+ mca_coll_base_module_t * module ,
1171
+ uint32_t segsize ,
1172
+ int max_outstanding_reqs , int radix )
1173
+ {
1174
+ int err = OMPI_SUCCESS , rank , line ;
1175
+ ptrdiff_t extent , lb ;
1176
+ size_t dtype_size ;
1177
+ char * child_buf = NULL ;
1178
+ char * child_buf_start = NULL ;
1179
+ char * reduce_buf = NULL ;
1180
+ char * reduce_buf_start = NULL ;
1181
+ char * sendtmpbuf = NULL ;
1182
+ mca_coll_base_module_t * base_module = (mca_coll_base_module_t * ) module ;
1183
+ mca_coll_base_comm_t * data = base_module -> base_data ;
1184
+ ompi_coll_tree_t * tree ;
1185
+ int num_children ;
1186
+ bool is_leaf ;
1187
+ ptrdiff_t buf_size , gap = 0 ;
1188
+ int max_reqs = 0 , num_reqs ;
1189
+ ompi_request_t * * reqs ;
1190
+
1191
+ OPAL_OUTPUT ((ompi_coll_base_framework .framework_output , "coll:base:ompi_coll_base_reduce_intra_knomial msg size %d, max_requests %d" ,
1192
+ count , max_outstanding_reqs ));
1193
+
1194
+ rank = ompi_comm_rank (comm );
1195
+
1196
+ // create a k-nomial tree with radix 4
1197
+ COLL_BASE_UPDATE_KMTREE (comm , base_module , root , radix );
1198
+ if (NULL == data -> cached_kmtree ) {
1199
+ // fail to create knomial tree fallback to previous allreduce method
1200
+ OPAL_OUTPUT ((ompi_coll_base_framework .framework_output ,
1201
+ "REDUCE: failed to create knomial tree. \n" ));
1202
+ goto err_hndl ;
1203
+ }
1204
+
1205
+ tree = data -> cached_kmtree ;
1206
+ num_children = tree -> tree_nextsize ;
1207
+ is_leaf = (tree -> tree_nextsize == 0 ) ? true : false;
1208
+
1209
+ ompi_datatype_get_extent (datatype , & lb , & extent );
1210
+ ompi_datatype_type_size (datatype , & dtype_size );
1211
+
1212
+ sendtmpbuf = (char * ) sendbuf ;
1213
+ if ( sendbuf == MPI_IN_PLACE ) {
1214
+ sendtmpbuf = (char * )recvbuf ;
1215
+ }
1216
+ buf_size = opal_datatype_span (& datatype -> super , (int64_t )count , & gap );
1217
+ reduce_buf = (char * )malloc (buf_size );
1218
+ reduce_buf_start = reduce_buf - gap ;
1219
+ err = ompi_datatype_copy_content_same_ddt (datatype , count ,
1220
+ (char * )reduce_buf_start ,
1221
+ (char * )sendtmpbuf );
1222
+ if (MPI_SUCCESS != err ) { line = __LINE__ ; goto err_hndl ; }
1223
+
1224
+ // do transfer in a single transaction instead of segments
1225
+ num_reqs = 0 ;
1226
+ max_reqs = num_children ;
1227
+ if (!is_leaf ) {
1228
+ buf_size = opal_datatype_span (& datatype -> super , (int64_t )count * num_children , & gap );
1229
+ child_buf = (char * )malloc (buf_size );
1230
+ child_buf_start = child_buf - gap ;
1231
+ reqs = ompi_coll_base_comm_get_reqs (data , max_reqs );
1232
+ }
1233
+
1234
+ for (int i = 0 ; i < num_children ; i ++ ) {
1235
+ int child = tree -> tree_next [i ];
1236
+ err = MCA_PML_CALL (irecv (child_buf_start + (ptrdiff_t )i * count * extent ,
1237
+ count ,
1238
+ datatype ,
1239
+ child ,
1240
+ MCA_COLL_BASE_TAG_REDUCE ,
1241
+ comm ,
1242
+ & reqs [num_reqs ++ ]));
1243
+ if (MPI_SUCCESS != err ) { line = __LINE__ ; goto err_hndl ; }
1244
+ }
1245
+
1246
+ if (num_reqs > 0 ) {
1247
+ err = ompi_request_wait_all (num_reqs , reqs , MPI_STATUS_IGNORE );
1248
+ if (MPI_SUCCESS != err ) { line = __LINE__ ; goto err_hndl ; }
1249
+ }
1250
+
1251
+ for (int i = 0 ; i < num_children ; i ++ ) {
1252
+ ompi_op_reduce (op ,
1253
+ child_buf_start + (ptrdiff_t )i * count * extent ,
1254
+ reduce_buf ,
1255
+ count ,
1256
+ datatype );
1257
+ }
1258
+
1259
+ if (rank != root ) {
1260
+ err = MCA_PML_CALL (send (reduce_buf_start ,
1261
+ count ,
1262
+ datatype ,
1263
+ tree -> tree_prev ,
1264
+ MCA_COLL_BASE_TAG_REDUCE ,
1265
+ MCA_PML_BASE_SEND_STANDARD ,
1266
+ comm ));
1267
+ if (MPI_SUCCESS != err ) { line = __LINE__ ; goto err_hndl ; }
1268
+ }
1269
+
1270
+ if (rank == root ) {
1271
+ err = ompi_datatype_copy_content_same_ddt (datatype , count ,
1272
+ (char * )recvbuf ,
1273
+ (char * )reduce_buf_start );
1274
+ if (MPI_SUCCESS != err ) { line = __LINE__ ; goto err_hndl ; }
1275
+ }
1276
+
1277
+ if (NULL != child_buf ) free (child_buf );
1278
+ if (NULL != reduce_buf ) free (reduce_buf );
1279
+ return MPI_SUCCESS ;
1280
+
1281
+ err_hndl :
1282
+ if (NULL != child_buf ) {
1283
+ free (child_buf );
1284
+ child_buf = NULL ;
1285
+ child_buf_start = NULL ;
1286
+ }
1287
+ if (NULL != reduce_buf ) {
1288
+ free (reduce_buf );
1289
+ reduce_buf = NULL ;
1290
+ reduce_buf_start = NULL ;
1291
+ }
1292
+ if ( NULL != reqs ) {
1293
+ if (MPI_ERR_IN_STATUS == err ) {
1294
+ for ( num_reqs = 0 ; num_reqs < tree -> tree_nextsize ; num_reqs ++ ) {
1295
+ if (MPI_REQUEST_NULL == reqs [num_reqs ]) continue ;
1296
+ if (MPI_ERR_PENDING == reqs [num_reqs ]-> req_status .MPI_ERROR ) continue ;
1297
+ if (reqs [num_reqs ]-> req_status .MPI_ERROR != MPI_SUCCESS ) {
1298
+ err = reqs [num_reqs ]-> req_status .MPI_ERROR ;
1299
+ break ;
1300
+ }
1301
+ }
1302
+ }
1303
+ ompi_coll_base_free_reqs (reqs , max_reqs );
1304
+ }
1305
+ OPAL_OUTPUT ((ompi_coll_base_framework .framework_output , "%s:%4d\tError occurred %d, rank %2d" ,
1306
+ __FILE__ , line , err , rank ));
1307
+ (void )line ; // silence compiler warning
1308
+ return err ;
1309
+
1310
+ }
0 commit comments