@@ -139,7 +139,6 @@ template <typename R, typename Compare> void local_sort(R &r, Compare &&comp) {
139
139
template <typename Compare>
140
140
void _find_split_idx (std::size_t &vidx, std::size_t &segidx, Compare &&comp,
141
141
auto &ls, auto &vec_v, auto &vec_i, auto &vec_s) {
142
-
143
142
while (vidx < default_comm ().size () && segidx < rng::size (ls)) {
144
143
if (comp (vec_v[vidx - 1 ], ls[segidx])) {
145
144
vec_i[vidx] = segidx;
@@ -231,26 +230,26 @@ void splitters(Seg &lsegment, Compare &&comp,
231
230
}
232
231
233
232
template <typename valT>
234
- void shift_data (const int shift_left, const int shift_right,
233
+ void shift_data (const int64_t shift_left, const int64_t shift_right,
235
234
buffer<valT> &vec_recvdata, buffer<valT> &vec_left,
236
235
buffer<valT> &vec_right) {
237
-
238
236
const std::size_t _comm_rank = default_comm ().rank ();
239
237
240
238
MPI_Request req_l, req_r;
241
239
MPI_Status stat_l, stat_r;
242
240
243
- assert (static_cast <int >(rng::size (vec_left)) == std::max (0 , shift_left));
244
- assert (static_cast <int >(rng::size (vec_right)) == std::max (0 , shift_right));
241
+ assert (static_cast <int64_t >(rng::size (vec_left)) == std::max (0L , shift_left));
242
+ assert (static_cast <int64_t >(rng::size (vec_right)) ==
243
+ std::max (0L , shift_right));
245
244
246
- if (static_cast <int >(rng::size (vec_recvdata)) < -shift_left) {
245
+ if (static_cast <int64_t >(rng::size (vec_recvdata)) < -shift_left) {
247
246
// Too little data in recv buffer to shift left - first get from right,
248
247
// then send left
249
248
DRLOG (" Get from right first, recvdata size {} shift left {}" ,
250
249
rng::size (vec_recvdata), shift_left);
251
250
// ** This will never happen, because values equal to a split value go left **
252
251
assert (false );
253
- } else if (static_cast <int >(rng::size (vec_recvdata)) < -shift_right) {
252
+ } else if (static_cast <int64_t >(rng::size (vec_recvdata)) < -shift_right) {
254
253
// Too little data in buffer to shift right - first get from left, then
255
254
// send right
256
255
assert (shift_left > 0 );
@@ -280,26 +279,23 @@ void shift_data(const int shift_left, const int shift_right,
280
279
MPI_Wait (&req_r, &stat_r);
281
280
} else {
282
281
// enough data in recv buffer
283
-
284
282
if (shift_left < 0 ) {
285
283
default_comm ().isend (rng::data (vec_recvdata), -shift_left, _comm_rank - 1 ,
286
284
&req_l);
287
285
} else if (shift_left > 0 ) {
288
- assert (shift_left == static_cast <int >(rng::size (vec_left)));
286
+ assert (shift_left == static_cast <int64_t >(rng::size (vec_left)));
289
287
default_comm ().irecv (rng::data (vec_left), rng::size (vec_left),
290
288
_comm_rank - 1 , &req_l);
291
289
}
292
-
293
290
if (shift_right > 0 ) {
294
- assert (shift_right == static_cast <int >(rng::size (vec_right)));
291
+ assert (shift_right == static_cast <int64_t >(rng::size (vec_right)));
295
292
default_comm ().irecv (rng::data (vec_right), rng::size (vec_right),
296
293
_comm_rank + 1 , &req_r);
297
294
} else if (shift_right < 0 ) {
298
295
default_comm ().isend (rng::data (vec_recvdata) + rng::size (vec_recvdata) +
299
296
shift_right,
300
297
-shift_right, _comm_rank + 1 , &req_r);
301
298
}
302
-
303
299
if (shift_left != 0 )
304
300
MPI_Wait (&req_l, &stat_l);
305
301
if (shift_right != 0 )
@@ -308,11 +304,11 @@ void shift_data(const int shift_left, const int shift_right,
308
304
}
309
305
310
306
template <typename valT>
311
- void copy_results (auto &lsegment, const int shift_left, const int shift_right ,
312
- buffer<valT> &vec_recvdata , buffer<valT> &vec_left ,
313
- buffer<valT> &vec_right) {
314
- const std::size_t invalidate_left = std::max (-shift_left, 0 );
315
- const std::size_t invalidate_right = std::max (-shift_right, 0 );
307
+ void copy_results (auto &lsegment, const int64_t shift_left,
308
+ const int64_t shift_right , buffer<valT> &vec_recvdata ,
309
+ buffer<valT> &vec_left, buffer<valT> & vec_right) {
310
+ const std::size_t invalidate_left = std::max (-shift_left, 0L );
311
+ const std::size_t invalidate_right = std::max (-shift_right, 0L );
316
312
317
313
const std::size_t size_l = rng::size (vec_left);
318
314
const std::size_t size_r = rng::size (vec_right);
@@ -355,7 +351,6 @@ void copy_results(auto &lsegment, const int shift_left, const int shift_right,
355
351
356
352
template <dr::distributed_range R, typename Compare>
357
353
void dist_sort (R &r, Compare &&comp) {
358
-
359
354
using valT = typename R::value_type;
360
355
361
356
const std::size_t _comm_rank = default_comm ().rank ();
@@ -370,6 +365,8 @@ void dist_sort(R &r, Compare &&comp) {
370
365
std::vector<std::size_t > vec_recv_elems (_comm_size, 0 );
371
366
std::size_t _total_elems = 0 ;
372
367
368
+ DRLOG (" Rank {}: Dist sort, local segment size {}" , default_comm ().rank (),
369
+ rng::size (lsegment));
373
370
__detail::local_sort (lsegment, comp);
374
371
375
372
/* find splitting values - limits of areas to send to other processes */
@@ -383,12 +380,8 @@ void dist_sort(R &r, Compare &&comp) {
383
380
384
381
/* send and receive data belonging to each node, then redistribute
385
382
* data to achieve size of data equal to size of local segment */
386
-
387
- /* TODO: all_gather() below can be asynchronous - to be verified in CI
388
- * (currently hangs in CI unit tests, but going well when started manually)
389
- */
383
+ /* async all_gather causes problems on some systems (it hung in CI unit tests), so the blocking all_gather below is used instead */
390
384
// MPI_Request req_recvelems;
391
- // default_comm().i_all_gather(_recv_elems, vec_recv_elems, &req_recvelems);
392
385
default_comm ().all_gather (_recv_elems, vec_recv_elems);
393
386
394
387
/* buffer for received data */
@@ -402,13 +395,12 @@ void dist_sort(R &r, Compare &&comp) {
402
395
/* TODO: vec recvdata is partially sorted, implementation of merge on GPU is
403
396
* desirable */
404
397
__detail::local_sort (vec_recvdata, comp);
405
-
406
398
// MPI_Wait(&req_recvelems, MPI_STATUS_IGNORE);
407
399
408
400
_total_elems = std::reduce (vec_recv_elems.begin (), vec_recv_elems.end ());
409
401
410
402
/* prepare data for shift to neighboring processes */
411
- std::vector<int > vec_shift (_comm_size - 1 );
403
+ std::vector<int64_t > vec_shift (_comm_size - 1 );
412
404
413
405
const auto desired_elems_num = (_total_elems + _comm_size - 1 ) / _comm_size;
414
406
@@ -417,12 +409,12 @@ void dist_sort(R &r, Compare &&comp) {
417
409
vec_shift[_i] = vec_shift[_i - 1 ] + desired_elems_num - vec_recv_elems[_i];
418
410
}
419
411
420
- const int shift_left = _comm_rank == 0 ? 0 : -vec_shift[_comm_rank - 1 ];
421
- const int shift_right =
412
+ const int64_t shift_left = _comm_rank == 0 ? 0 : -vec_shift[_comm_rank - 1 ];
413
+ const int64_t shift_right =
422
414
_comm_rank == _comm_size - 1 ? 0 : vec_shift[_comm_rank];
423
415
424
- buffer<valT> vec_left (std::max (shift_left, 0 ));
425
- buffer<valT> vec_right (std::max (shift_right, 0 ));
416
+ buffer<valT> vec_left (std::max (shift_left, 0L ));
417
+ buffer<valT> vec_right (std::max (shift_right, 0L ));
426
418
427
419
/* shift data if necessary, to have exactly the number of elements equal to
428
420
* lsegment size */
@@ -432,7 +424,6 @@ void dist_sort(R &r, Compare &&comp) {
432
424
/* copy results to distributed vector's local segment */
433
425
__detail::copy_results<valT>(lsegment, shift_left, shift_right, vec_recvdata,
434
426
vec_left, vec_right);
435
-
436
427
} // __detail::dist_sort
437
428
438
429
} // namespace __detail
@@ -446,14 +437,15 @@ void sort(R &r, Compare &&comp = Compare()) {
446
437
std::size_t _comm_size = default_comm ().size (); // dr-style ignore
447
438
448
439
if (_comm_size == 1 ) {
440
+ DRLOG (" mhp::sort() - one node only" );
449
441
auto &&lsegment = local_segment (r);
450
442
__detail::local_sort (lsegment, comp);
451
443
452
444
} else if (rng::size (r) <= (_comm_size - 1 ) * (_comm_size - 1 )) {
453
445
/* Distributed vector of size <= (comm_size-1) * (comm_size-1) may have
454
446
* 0-size local segments. It is also small enough to prefer sequential sort
455
447
*/
456
- DRLOG (" mhp::sort() - local sort" );
448
+ DRLOG (" mhp::sort() - local sort on node 0 " );
457
449
458
450
std::vector<valT> vec_recvdata (rng::size (r));
459
451
dr::mhp::copy (0 , r, rng::begin (vec_recvdata));
0 commit comments