@@ -69,6 +69,10 @@ int mca_coll_han_alltoall_using_smsc(
69
69
{
70
70
71
71
mca_coll_han_module_t * han_module = (mca_coll_han_module_t * )module ;
72
+ opal_convertor_t convertor ;
73
+ int send_needs_bounce , have_device_buffer ;
74
+ size_t packed_size = 0 ;
75
+
72
76
73
77
OPAL_OUTPUT_VERBOSE ((90 , mca_coll_han_component .han_output ,
74
78
"Entering mca_coll_han_alltoall_using_smsc\n" ));
@@ -82,6 +86,44 @@ int mca_coll_han_alltoall_using_smsc(
82
86
comm , han_module -> previous_alltoall_module );
83
87
}
84
88
89
+ if (sbuf == MPI_IN_PLACE ) {
90
+ /* This is not an in-place algorithm */
91
+ return han_module -> previous_alltoall (sbuf , scount , sdtype , rbuf , rcount , rdtype ,
92
+ comm , han_module -> previous_alltoall_module );
93
+ }
94
+
95
+ OBJ_CONSTRUCT ( & convertor , opal_convertor_t );
96
+ send_needs_bounce = 0 ;
97
+ have_device_buffer = 0 ;
98
+ /* get convertor for copying to one of the leader ranks, and get packed size: */
99
+ opal_convertor_copy_and_prepare_for_send (ompi_mpi_local_convertor , & sdtype -> super , scount , sbuf , 0 , & convertor );
100
+ have_device_buffer |= opal_convertor_on_device (& convertor );
101
+ send_needs_bounce |= opal_convertor_need_buffers (& convertor );
102
+ opal_convertor_cleanup (& convertor );
103
+
104
+ opal_convertor_copy_and_prepare_for_recv (ompi_mpi_local_convertor , & rdtype -> super , rcount , rbuf , 0 , & convertor );
105
+ have_device_buffer |= opal_convertor_on_device (& convertor );
106
+ send_needs_bounce |= opal_convertor_need_buffers (& convertor );
107
+ opal_convertor_get_packed_size ( & convertor , & packed_size );
108
+ opal_convertor_cleanup (& convertor );
109
+
110
+ if (have_device_buffer ) {
111
+ /*
112
+ Although this algorithm is functional for device buffers, it requires an
113
+ extra copy through the bounce buffer, which makes it inefficient.
114
+ Prefer another algorithm instead.
115
+
116
+ Note that Open MPI makes assumptions that if one rank uses a device
117
+ buffer in a collective, then all ranks will use device buffers, so there
118
+ is no need to communicate before taking this branch.
119
+ */
120
+ OBJ_DESTRUCT (& convertor );
121
+ return han_module -> previous_alltoall (sbuf , scount , sdtype , rbuf , rcount , rdtype ,
122
+ comm , han_module -> previous_alltoall_module );
123
+ }
124
+
125
+
126
+
85
127
/* Create the subcommunicators */
86
128
if ( OMPI_SUCCESS != mca_coll_han_comm_create_new (comm , han_module ) ) {
87
129
opal_output_verbose (1 , mca_coll_han_component .han_output ,
@@ -107,12 +149,11 @@ int mca_coll_han_alltoall_using_smsc(
107
149
comm , han_module -> previous_alltoall_module );
108
150
}
109
151
110
- int rc , send_needs_bounce , ii_push_data ;
152
+ int rc , ii_push_data ;
111
153
size_t sndsize ;
112
154
MPI_Aint sextent , rextent , lb ;
113
- char * send_bounce ;
114
- opal_convertor_t convertor ;
115
- size_t packed_size = 0 , packed_size_tmp ;
155
+ char * send_bounce = NULL ;
156
+ size_t packed_size_tmp ;
116
157
int use_isend ;
117
158
void * gather_buf_in [4 ];
118
159
int up_rank ;
@@ -140,22 +181,6 @@ int mca_coll_han_alltoall_using_smsc(
140
181
}
141
182
if (fanout > up_size ) { fanout = up_size ; }
142
183
143
- OBJ_CONSTRUCT ( & convertor , opal_convertor_t );
144
-
145
-
146
- send_needs_bounce = 0 ;
147
- /* get converter for copying to one of the leader ranks, and get packed size: */
148
- opal_convertor_copy_and_prepare_for_send (ompi_mpi_local_convertor , & sdtype -> super , scount , sbuf , 0 , & convertor );
149
- send_needs_bounce |= 0 != opal_convertor_on_device (& convertor );
150
- send_needs_bounce |= opal_convertor_need_buffers (& convertor );
151
- opal_convertor_cleanup (& convertor );
152
-
153
- opal_convertor_copy_and_prepare_for_recv (ompi_mpi_local_convertor , & rdtype -> super , rcount , rbuf , 0 , & convertor );
154
- send_needs_bounce |= 0 != opal_convertor_on_device (& convertor );
155
- send_needs_bounce |= opal_convertor_need_buffers (& convertor );
156
- opal_convertor_get_packed_size ( & convertor , & packed_size );
157
- opal_convertor_cleanup (& convertor );
158
-
159
184
/*
160
185
Because push-mode needs extra synchronizations, we'd like to avoid it,
161
186
however it might be necessary:
0 commit comments