|
28 | 28 |
|
29 | 29 | static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype datatype, ptrdiff_t gap, const void *sendbuf,
|
30 | 30 | void *recvbuf, MPI_Op op, char inplace, NBC_Schedule *schedule, void *tmpbuf);
|
| 31 | +static inline int allred_sched_recursivedoubling(int rank, int p, const void *sendbuf, void *recvbuf, |
| 32 | + int count, MPI_Datatype datatype, ptrdiff_t gap, MPI_Op op, |
| 33 | + char inplace, NBC_Schedule *schedule, void *tmpbuf); |
31 | 34 | static inline int allred_sched_ring(int rank, int p, int count, MPI_Datatype datatype, const void *sendbuf,
|
32 | 35 | void *recvbuf, MPI_Op op, int size, int ext, NBC_Schedule *schedule,
|
33 | 36 | void *tmpbuf);
|
@@ -69,7 +72,7 @@ static int nbc_allreduce_init(const void* sendbuf, void* recvbuf, int count, MPI
|
69 | 72 | #ifdef NBC_CACHE_SCHEDULE
|
70 | 73 | NBC_Allreduce_args *args, *found, search;
|
71 | 74 | #endif
|
72 |
| - enum { NBC_ARED_BINOMIAL, NBC_ARED_RING, NBC_ARED_REDSCAT_ALLGATHER } alg; |
| 75 | + enum { NBC_ARED_BINOMIAL, NBC_ARED_RING, NBC_ARED_REDSCAT_ALLGATHER, NBC_ARED_RDBL } alg; |
73 | 76 | char inplace;
|
74 | 77 | void *tmpbuf = NULL;
|
75 | 78 | ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;
|
@@ -124,9 +127,11 @@ static int nbc_allreduce_init(const void* sendbuf, void* recvbuf, int count, MPI
|
124 | 127 | alg = NBC_ARED_RING;
|
125 | 128 | else if (libnbc_iallreduce_algorithm == 2)
|
126 | 129 | alg = NBC_ARED_BINOMIAL;
|
127 |
| - else if (libnbc_iallreduce_algorithm == 3 && count >= nprocs_pof2 && ompi_op_is_commute(op)) { |
| 130 | + else if (libnbc_iallreduce_algorithm == 3 && count >= nprocs_pof2 && ompi_op_is_commute(op)) |
128 | 131 | alg = NBC_ARED_REDSCAT_ALLGATHER;
|
129 |
| - } else |
| 132 | + else if (libnbc_iallreduce_algorithm == 4) |
| 133 | + alg = NBC_ARED_RDBL; |
| 134 | + else |
130 | 135 | alg = NBC_ARED_RING;
|
131 | 136 | }
|
132 | 137 | #ifdef NBC_CACHE_SCHEDULE
|
@@ -159,6 +164,9 @@ static int nbc_allreduce_init(const void* sendbuf, void* recvbuf, int count, MPI
|
159 | 164 | case NBC_ARED_RING:
|
160 | 165 | res = allred_sched_ring(rank, p, count, datatype, sendbuf, recvbuf, op, size, ext, schedule, tmpbuf);
|
161 | 166 | break;
|
| 167 | + case NBC_ARED_RDBL: |
| 168 | + res = allred_sched_recursivedoubling(rank, p, sendbuf, recvbuf, count, datatype, gap, op, inplace, schedule, tmpbuf); |
| 169 | + break; |
162 | 170 | }
|
163 | 171 | }
|
164 | 172 |
|
@@ -470,6 +478,161 @@ static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype dat
|
470 | 478 | return OMPI_SUCCESS;
|
471 | 479 | }
|
472 | 480 |
|
| 481 | +/* |
| 482 | + * allred_sched_recursivedoubling |
| 483 | + * |
| 484 | + * Function: Recursive doubling algorithm for iallreduce operation |
| 485 | + * |
| 486 | + * Description: Implements recursive doubling algorithm for iallreduce. |
| 487 | + * The algorithm preserves order of operations so it can |
| 488 | + * be used both by commutative and non-commutative operations. |
| 489 | + * Schedule length: O(\log(p)) |
| 490 | + * Memory requirements: |
| 491 | + * Each process requires a temporary buffer: count * typesize = O(count) |
| 492 | + * |
| 493 | + * Example on 7 nodes: |
| 494 | + * Initial state |
| 495 | + * # 0 1 2 3 4 5 6 |
| 496 | + * [0] [1] [2] [3] [4] [5] [6] |
| 497 | + * Initial adjustment step for non-power of two nodes. |
| 498 | + * old rank 1 3 5 6 |
| 499 | + * new rank 0 1 2 3 |
| 500 | + * [0+1] [2+3] [4+5] [6] |
| 501 | + * Step 1 |
| 502 | + * old rank 1 3 5 6 |
| 503 | + * new rank 0 1 2 3 |
| 504 | + * [0+1+] [0+1+] [4+5+] [4+5+] |
| 505 | + * [2+3+] [2+3+] [6 ] [6 ] |
| 506 | + * Step 2 |
| 507 | + * old rank 1 3 5 6 |
| 508 | + * new rank 0 1 2 3 |
| 509 | + * [0+1+] [0+1+] [0+1+] [0+1+] |
| 510 | + * [2+3+] [2+3+] [2+3+] [2+3+] |
| 511 | + * [4+5+] [4+5+] [4+5+] [4+5+] |
| 512 | + * [6 ] [6 ] [6 ] [6 ] |
| 513 | + * Final adjustment step for non-power of two nodes |
| 514 | + * # 0 1 2 3 4 5 6 |
| 515 | + * [0+1+] [0+1+] [0+1+] [0+1+] [0+1+] [0+1+] [0+1+] |
| 516 | + * [2+3+] [2+3+] [2+3+] [2+3+] [2+3+] [2+3+] [2+3+] |
| 517 | + * [4+5+] [4+5+] [4+5+] [4+5+] [4+5+] [4+5+] [4+5+] |
| 518 | + * [6 ] [6 ] [6 ] [6 ] [6 ] [6 ] [6 ] |
| 519 | + * |
| 520 | + */ |
| 521 | +static inline int allred_sched_recursivedoubling(int rank, int p, const void *sendbuf, void *recvbuf, |
| 522 | + int count, MPI_Datatype datatype, ptrdiff_t gap, MPI_Op op, |
| 523 | + char inplace, NBC_Schedule *schedule, void *tmpbuf) |
| 524 | +{ |
| 525 | + int res, pof2, nprocs_rem, vrank; |
| 526 | + char *tmpsend = NULL, *tmprecv = NULL, *tmpswap = NULL; |
| 527 | + |
| 528 | + tmpsend = (char*) tmpbuf - gap; |
| 529 | + tmprecv = (char*) recvbuf; |
| 530 | + |
| 531 | + if (inplace) { |
| 532 | + res = NBC_Sched_copy(recvbuf, false, count, datatype, |
| 533 | + tmpsend, false, count, datatype, schedule, true); |
| 534 | + } else { |
| 535 | + res = NBC_Sched_copy((void *)sendbuf, false, count, datatype, |
| 536 | + tmpsend, false, count, datatype, schedule, true); |
| 537 | + } |
| 538 | + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } |
| 539 | + |
| 540 | + /* Get nearest power of two less than or equal to comm size */ |
| 541 | + pof2 = opal_next_poweroftwo(p) >> 1; |
| 542 | + |
| 543 | + /* Handle non-power-of-two case: |
| 544 | + - Even ranks less than 2 * nprocs_rem send their data to (rank + 1), and |
| 545 | + sets new rank to -1. |
| 546 | + - Odd ranks less than 2 * nprocs_rem receive data from (rank - 1), |
| 547 | + apply appropriate operation, and set new rank to rank/2 |
| 548 | + - Everyone else sets rank to rank - nprocs_rem |
| 549 | + */ |
| 550 | + nprocs_rem = p - pof2; |
| 551 | + if (rank < 2 * nprocs_rem) { |
| 552 | + if (0 == rank % 2) { /* Even */ |
| 553 | + res = NBC_Sched_send(tmpsend, false, count, datatype, rank + 1, schedule, true); |
| 554 | + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } |
| 555 | + vrank = -1; |
| 556 | + } else { /* Odd */ |
| 557 | + res = NBC_Sched_recv(tmprecv, false, count, datatype, rank - 1, schedule, true); |
| 558 | + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } |
| 559 | + |
| 560 | + /* tmpsend = tmprecv (op) tmpsend */ |
| 561 | + res = NBC_Sched_op(tmprecv, false, tmpsend, false, count, datatype, op, schedule, true); |
| 562 | + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } |
| 563 | + |
| 564 | + vrank = rank >> 1; |
| 565 | + } |
| 566 | + } else { |
| 567 | + vrank = rank - nprocs_rem; |
| 568 | + } |
| 569 | + |
| 570 | + /* Communication/Computation loop |
| 571 | + - Exchange message with remote node. |
| 572 | + - Perform appropriate operation taking in account order of operations: |
| 573 | + result = value (op) result |
| 574 | + */ |
| 575 | + if (0 <= vrank) { |
| 576 | + for (int distance = 1; distance < pof2; distance <<= 1) { |
| 577 | + int remote = vrank ^ distance; |
| 578 | + |
| 579 | + /* Find real rank of remote node */ |
| 580 | + if (remote < nprocs_rem) { |
| 581 | + remote = remote * 2 + 1; |
| 582 | + } else { |
| 583 | + remote += nprocs_rem; |
| 584 | + } |
| 585 | + |
| 586 | + /* Exchange the data */ |
| 587 | + res = NBC_Sched_send(tmpsend, false, count, datatype, remote, schedule, false); |
| 588 | + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } |
| 589 | + |
| 590 | + res = NBC_Sched_recv(tmprecv, false, count, datatype, remote, schedule, true); |
| 591 | + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } |
| 592 | + |
| 593 | + /* Apply operation */ |
| 594 | + if (rank < remote) { |
| 595 | + /* tmprecv = tmpsend (op) tmprecv */ |
| 596 | + res = NBC_Sched_op(tmpsend, false, tmprecv, false, |
| 597 | + count, datatype, op, schedule, true); |
| 598 | + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } |
| 599 | + |
| 600 | + /* Swap tmpsend and tmprecv buffers */ |
| 601 | + tmpswap = tmprecv; tmprecv = tmpsend; tmpsend = tmpswap; |
| 602 | + } else { |
| 603 | + /* tmpsend = tmprecv (op) tmpsend */ |
| 604 | + res = NBC_Sched_op(tmprecv, false, tmpsend, false, |
| 605 | + count, datatype, op, schedule, true); |
| 606 | + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } |
| 607 | + } |
| 608 | + } |
| 609 | + } |
| 610 | + |
| 611 | + /* Handle non-power-of-two case: |
| 612 | + - Even ranks less than 2 * nprocs_rem receive result from (rank + 1) |
| 613 | + - Odd ranks less than 2 * nprocs_rem send result from tmpsend to (rank - 1) |
| 614 | + */ |
| 615 | + if (rank < 2 * nprocs_rem) { |
| 616 | + if (0 == rank % 2) { /* Even */ |
| 617 | + res = NBC_Sched_recv(recvbuf, false, count, datatype, rank + 1, schedule, false); |
| 618 | + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } |
| 619 | + tmpsend = (char *)recvbuf; |
| 620 | + } else { /* Odd */ |
| 621 | + res = NBC_Sched_send(tmpsend, false, count, datatype, rank - 1, schedule, false); |
| 622 | + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } |
| 623 | + } |
| 624 | + } |
| 625 | + |
| 626 | + /* Copy result back into recvbuf */ |
| 627 | + if (tmpsend != recvbuf) { |
| 628 | + res = NBC_Sched_copy(tmpsend, false, count, datatype, |
| 629 | + recvbuf, false, count, datatype, schedule, false); |
| 630 | + if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } |
| 631 | + } |
| 632 | + |
| 633 | + return OMPI_SUCCESS; |
| 634 | +} |
| 635 | + |
473 | 636 | static inline int allred_sched_ring (int r, int p, int count, MPI_Datatype datatype, const void *sendbuf, void *recvbuf, MPI_Op op,
|
474 | 637 | int size, int ext, NBC_Schedule *schedule, void *tmpbuf) {
|
475 | 638 | int segsize, *segsizes, *segoffsets; /* segment sizes and offsets per segment (number of segments == number of nodes */
|
@@ -1044,4 +1207,3 @@ int ompi_coll_libnbc_allreduce_inter_init(const void* sendbuf, void* recvbuf, in
|
1044 | 1207 |
|
1045 | 1208 | return OMPI_SUCCESS;
|
1046 | 1209 | }
|
1047 |
| - |
|
0 commit comments