|
25 | 25 |
|
26 | 26 | #include "ompi_config.h"
|
27 | 27 |
|
|      28 | +#include <math.h> |
| 29 | + |
28 | 30 | #include "mpi.h"
|
29 | 31 | #include "ompi/constants.h"
|
30 | 32 | #include "ompi/datatype/ompi_datatype.h"
|
@@ -202,6 +204,154 @@ int ompi_coll_base_allgatherv_intra_bruck(const void *sbuf, int scount,
|
202 | 204 | return err;
|
203 | 205 | }
|
204 | 206 |
|
| 207 | +/* |
|     208 | + * ompi_coll_base_allgatherv_intra_sparbit |
|     209 | + * |
|     210 | + * Function:     allgatherv using O(log(N)) steps. |
|     211 | + * Accepts:      Same arguments as MPI_Allgatherv |
|     212 | + * Returns:      MPI_SUCCESS or error code |
|     213 | + * |
|     214 | + * Description: An allgatherv algorithm similar to Bruck's, but with inverted distances |
|     215 | + *              and non-decreasing exchanged data sizes. Described in "Sparbit: a new |
|     216 | + *              logarithmic-cost and data locality-aware MPI Allgather algorithm". |
| 217 | + * |
| 218 | + * Memory requirements: |
| 219 | + * Additional memory for N requests. |
| 220 | + * |
|     221 | + * Example on 6 nodes, with l representing the highest power of two smaller than N, in this case l = |
|     222 | + * 4 (more details can be found in the paper): |
| 223 | + * Initial state |
| 224 | + * # 0 1 2 3 4 5 |
| 225 | + * [0] [ ] [ ] [ ] [ ] [ ] |
| 226 | + * [ ] [1] [ ] [ ] [ ] [ ] |
| 227 | + * [ ] [ ] [2] [ ] [ ] [ ] |
| 228 | + * [ ] [ ] [ ] [3] [ ] [ ] |
| 229 | + * [ ] [ ] [ ] [ ] [4] [ ] |
| 230 | + * [ ] [ ] [ ] [ ] [ ] [5] |
| 231 | + * Step 0: Each process sends its own block to process r + l and receives another from r - l. |
| 232 | + * # 0 1 2 3 4 5 |
| 233 | + * [0] [ ] [ ] [ ] [0] [ ] |
| 234 | + * [ ] [1] [ ] [ ] [ ] [1] |
| 235 | + * [2] [ ] [2] [ ] [ ] [ ] |
| 236 | + * [ ] [3] [ ] [3] [ ] [ ] |
| 237 | + * [ ] [ ] [4] [ ] [4] [ ] |
| 238 | + * [ ] [ ] [ ] [5] [ ] [5] |
| 239 | + * Step 1: Each process sends its own block to process r + l/2 and receives another from r - l/2. |
| 240 | + * The block received on the previous step is ignored to avoid a future double-write. |
| 241 | + * # 0 1 2 3 4 5 |
| 242 | + * [0] [ ] [0] [ ] [0] [ ] |
| 243 | + * [ ] [1] [ ] [1] [ ] [1] |
| 244 | + * [2] [ ] [2] [ ] [2] [ ] |
| 245 | + * [ ] [3] [ ] [3] [ ] [3] |
| 246 | + * [4] [ ] [4] [ ] [4] [ ] |
| 247 | + * [ ] [5] [ ] [5] [ ] [5] |
|     248 | + *   Step 2: Each process sends all the data it has (3 blocks) to process r + l/4 and similarly |
| 249 | + * receives all the data from process r - l/4. |
| 250 | + * # 0 1 2 3 4 5 |
| 251 | + * [0] [0] [0] [0] [0] [0] |
| 252 | + * [1] [1] [1] [1] [1] [1] |
| 253 | + * [2] [2] [2] [2] [2] [2] |
| 254 | + * [3] [3] [3] [3] [3] [3] |
| 255 | + * [4] [4] [4] [4] [4] [4] |
| 256 | + * [5] [5] [5] [5] [5] [5] |
| 257 | + */ |
| 258 | + |
| 259 | +int ompi_coll_base_allgatherv_intra_sparbit(const void *sbuf, int scount, |
| 260 | + struct ompi_datatype_t *sdtype, |
| 261 | + void* rbuf, const int *rcounts, |
| 262 | + const int *rdispls, |
| 263 | + struct ompi_datatype_t *rdtype, |
| 264 | + struct ompi_communicator_t *comm, |
| 265 | + mca_coll_base_module_t *module) |
| 266 | +{ |
| 267 | + /* ################# VARIABLE DECLARATION, BUFFER CREATION AND PREPARATION FOR THE ALGORITHM ######################## */ |
| 268 | + |
|     269 | +    /* variable declarations */ |
| 270 | + int rank = 0, comm_size = 0, comm_log = 0, exclusion = 0; |
| 271 | + int data_expected = 1, transfer_count = 0, step_requests = 0; |
| 272 | + int sendto, recvfrom, send_disp, recv_disp; |
| 273 | + uint32_t last_ignore, ignore_steps, distance = 1; |
| 274 | + |
| 275 | + int err = 0; |
| 276 | + int line = -1; |
| 277 | + |
| 278 | + ptrdiff_t rlb, rext; |
| 279 | + |
| 280 | + char *tmpsend = NULL, *tmprecv = NULL; |
| 281 | + |
| 282 | + MPI_Request *requests = NULL; |
| 283 | + |
|     286 | +    comm_size = ompi_comm_size(comm); |
|     287 | +    rank = ompi_comm_rank(comm); |
|     288 | + |
|     289 | +    /* algorithm choice information printing */ |
|     290 | +    OPAL_OUTPUT((ompi_coll_base_framework.framework_output, |
|     291 | +                 "coll:base:allgatherv_intra_sparbit rank %d", rank)); |
| 292 | + |
| 293 | + err = ompi_datatype_get_extent(rdtype, &rlb, &rext); |
| 294 | + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } |
| 295 | + |
|     296 | +    /* If MPI_IN_PLACE is not used, the send buffer is first copied into the receive buffer; all data is then extracted from and forwarded out of the receive buffer. */ |
|     297 | +    /* tmprecv and tmpsend are used as abstract pointers to simplify the send and receive buffer choice. */ |
| 298 | + tmprecv = (char *) rbuf; |
| 299 | + if(MPI_IN_PLACE != sbuf){ |
| 300 | + tmpsend = (char *) sbuf; |
| 301 | + err = ompi_datatype_sndrcv(tmpsend, scount, sdtype, tmprecv + (ptrdiff_t) rdispls[rank] * rext, scount, rdtype); |
| 302 | + if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } |
| 303 | + } |
| 304 | + tmpsend = tmprecv; |
| 305 | + |
|     306 | +    requests = (MPI_Request *) malloc(comm_size * sizeof(MPI_Request)); |
|         | +    if (NULL == requests) { err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl; } |
| 307 | + |
| 308 | + /* ################# ALGORITHM LOGIC ######################## */ |
| 309 | + |
|     310 | +    /* number of steps is ceil(log2(comm_size)); the initial distance is the largest power of two smaller than comm_size (l in the example above) */ |
| 311 | + comm_log = ceil(log(comm_size)/log(2)); |
| 312 | + distance <<= comm_log - 1; |
| 313 | + |
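|         | +    /* When comm_size is not a power of two, some steps must send one block fewer than |
|         | +     * usual so that no block is delivered twice. ignore_steps is a bit mask, built from |
|         | +     * the binary representation of comm_size, of the step distances at which this |
|         | +     * exclusion applies (checked below through the exclusion flag). */ |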
| 314 | + last_ignore = __builtin_ctz(comm_size); |
| 315 | + ignore_steps = (~((uint32_t) comm_size >> last_ignore) | 1) << last_ignore; |
| 316 | + |
| 317 | + /* perform the parallel binomial tree distribution steps */ |
| 318 | + for (int i = 0; i < comm_log; ++i) { |
| 319 | + sendto = (rank + distance) % comm_size; |
| 320 | + recvfrom = (rank - distance + comm_size) % comm_size; |
| 321 | + exclusion = (distance & ignore_steps) == distance; |
| 322 | + |
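|         | +        /* Each iteration of this loop forwards one of the blocks gathered so far: the block |
|         | +         * originally owned by rank - 2*k*distance is sent, and the block owned by |
|         | +         * rank - (2*k+1)*distance is received (indices mod comm_size, k = transfer_count). */ |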
| 323 | + for (transfer_count = 0; transfer_count < data_expected - exclusion; transfer_count++) { |
| 324 | + send_disp = (rank - 2 * transfer_count * distance + comm_size) % comm_size; |
| 325 | + recv_disp = (rank - (2 * transfer_count + 1) * distance + comm_size) % comm_size; |
| 326 | + |
|     327 | +            /* Since each process sends several non-contiguous blocks of data to the same destination, |
|     328 | +             * each block sent (and therefore each send and recv call) needs a different tag. */ |
|     329 | +            /* As the base collective framework provides only one tag for allgatherv, we are forced to use |
|     330 | +             * tag space from other components in these send and recv calls. */ |
| 331 | + if(rcounts[send_disp] > 0) |
| 332 | + MCA_PML_CALL(isend(tmpsend + (ptrdiff_t) rdispls[send_disp] * rext, rcounts[send_disp], rdtype, sendto, MCA_COLL_BASE_TAG_HCOLL_BASE - send_disp, MCA_PML_BASE_SEND_STANDARD, comm, requests + step_requests++)); |
| 333 | + if(rcounts[recv_disp] > 0) |
| 334 | + MCA_PML_CALL(irecv(tmprecv + (ptrdiff_t) rdispls[recv_disp] * rext, rcounts[recv_disp], rdtype, recvfrom, MCA_COLL_BASE_TAG_HCOLL_BASE - recv_disp, comm, requests + step_requests++)); |
| 335 | + } |
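|         | +        /* All requests of this step must complete before moving on, since the blocks |
|         | +         * received here are forwarded during the following steps. */ |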
| 336 | + ompi_request_wait_all(step_requests, requests, MPI_STATUSES_IGNORE); |
| 337 | + |
| 338 | + distance >>= 1; |
|     339 | +        /* calculate the number of blocks expected in the next step, based on the current count and any exclusion applied in this step */ |
| 340 | + data_expected = (data_expected << 1) - exclusion; |
| 341 | + exclusion = step_requests = 0; |
| 342 | + } |
| 343 | + |
| 344 | + free(requests); |
| 345 | + |
| 346 | + return OMPI_SUCCESS; |
| 347 | + |
| 348 | +err_hndl: |
| 349 | + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d", |
| 350 | + __FILE__, line, err, rank)); |
| 351 | + (void)line; // silence compiler warning |
| 352 | + return err; |
| 353 | + |
| 354 | +} |
205 | 355 |
|
206 | 356 | /*
|
207 | 357 | * ompi_coll_base_allgatherv_intra_ring
|
|