Skip to content

Commit 0c4ff68

Browse files
committed
mca/coll: add reduce method (knomial)
Use a tree-based method to perform reduce; the tree can be knomial/kary/binomial/binary. This method uses a knomial tree by default. The radix is user-configurable and the default radix is 4 for intranode; for internode, a larger radix (k) may be beneficial to performance. Signed-off-by: Jun Tang <juntangc@amazon.com>
1 parent 79df463 commit 0c4ff68

File tree

4 files changed

+175
-2
lines changed

4 files changed

+175
-2
lines changed

ompi/mca/coll/base/coll_base_functions.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -275,6 +275,7 @@ int ompi_coll_base_reduce_intra_binary(REDUCE_ARGS, uint32_t segsize, int max_ou
275275
int ompi_coll_base_reduce_intra_binomial(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
276276
int ompi_coll_base_reduce_intra_in_order_binary(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
277277
int ompi_coll_base_reduce_intra_redscat_gather(REDUCE_ARGS);
278+
int ompi_coll_base_reduce_intra_knomial(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs, int radix);
278279

279280
/* Reduce_scatter */
280281
int ompi_coll_base_reduce_scatter_intra_nonoverlapping(REDUCESCATTER_ARGS);

ompi/mca/coll/base/coll_base_reduce.c

Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1142,3 +1142,169 @@ int ompi_coll_base_reduce_intra_redscat_gather(
11421142
free(scount);
11431143
return err;
11441144
}
1145+
1146+
/*
1147+
* ompi_coll_base_reduce_intra_knomial
1148+
*
1149+
* Function: reduce using k-nomial tree algorithm
1150+
* Accepts: Same arguments as MPI_Reduce, plus radix
1151+
* Returns: MPI_SUCCESS or error code
1152+
* Parameters: radix -- k-nomial tree radix (>= 2)
1153+
*
1154+
* Time complexity: (radix - 1)O(\log_{radix}(comm_size))
1155+
*
1156+
* Example, comm_size=10
1157+
* radix=2 radix=3 radix=4
1158+
* 0 0 0
1159+
* / / \ \ / / | \ \ / / \ \ \
1160+
* 8 4 2 1 9 3 6 1 2 4 8 1 2 3
1161+
* | |\ | |\ |\ /|\ |
1162+
* 9 6 5 3 4 5 7 8 5 6 7 9
1163+
* |
1164+
* 7
1165+
*/
1166+
int ompi_coll_base_reduce_intra_knomial( const void *sendbuf, void *recvbuf,
1167+
int count, ompi_datatype_t* datatype,
1168+
ompi_op_t* op, int root,
1169+
ompi_communicator_t* comm,
1170+
mca_coll_base_module_t *module,
1171+
uint32_t segsize,
1172+
int max_outstanding_reqs, int radix)
1173+
{
1174+
int err = OMPI_SUCCESS, rank, line;
1175+
ptrdiff_t extent, lb;
1176+
size_t dtype_size;
1177+
char *child_buf = NULL;
1178+
char *child_buf_start = NULL;
1179+
char *reduce_buf = NULL;
1180+
char *reduce_buf_start = NULL;
1181+
char *sendtmpbuf = NULL;
1182+
mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module;
1183+
mca_coll_base_comm_t *data = base_module->base_data;
1184+
ompi_coll_tree_t* tree;
1185+
int num_children;
1186+
bool is_leaf;
1187+
ptrdiff_t buf_size, gap = 0;
1188+
int max_reqs = 0, num_reqs;
1189+
ompi_request_t **reqs;
1190+
1191+
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:ompi_coll_base_reduce_intra_knomial msg size %d, max_requests %d",
1192+
count, max_outstanding_reqs));
1193+
1194+
rank = ompi_comm_rank(comm);
1195+
1196+
// create a k-nomial tree with radix 4
1197+
COLL_BASE_UPDATE_KMTREE(comm, base_module, root, radix);
1198+
if (NULL == data->cached_kmtree) {
1199+
// fail to create knomial tree fallback to previous allreduce method
1200+
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
1201+
"REDUCE: failed to create knomial tree. \n"));
1202+
goto err_hndl;
1203+
}
1204+
1205+
tree = data->cached_kmtree;
1206+
num_children = tree->tree_nextsize;
1207+
is_leaf = (tree->tree_nextsize == 0) ? true : false;
1208+
1209+
ompi_datatype_get_extent(datatype, &lb, &extent);
1210+
ompi_datatype_type_size(datatype, &dtype_size);
1211+
1212+
sendtmpbuf = (char*) sendbuf;
1213+
if( sendbuf == MPI_IN_PLACE ) {
1214+
sendtmpbuf = (char *)recvbuf;
1215+
}
1216+
buf_size = opal_datatype_span(&datatype->super, (int64_t)count, &gap);
1217+
reduce_buf = (char *)malloc(buf_size);
1218+
reduce_buf_start = reduce_buf - gap;
1219+
err = ompi_datatype_copy_content_same_ddt(datatype, count,
1220+
(char*)reduce_buf_start,
1221+
(char*)sendtmpbuf);
1222+
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
1223+
1224+
// do transfer in a single transaction instead of segments
1225+
num_reqs = 0;
1226+
max_reqs = num_children;
1227+
if(!is_leaf) {
1228+
buf_size = opal_datatype_span(&datatype->super, (int64_t)count * num_children, &gap);
1229+
child_buf = (char *)malloc(buf_size);
1230+
child_buf_start = child_buf - gap;
1231+
reqs = ompi_coll_base_comm_get_reqs(data, max_reqs);
1232+
}
1233+
1234+
for (int i = 0; i < num_children; i++) {
1235+
int child = tree->tree_next[i];
1236+
err = MCA_PML_CALL(irecv(child_buf_start + (ptrdiff_t)i * count * extent,
1237+
count,
1238+
datatype,
1239+
child,
1240+
MCA_COLL_BASE_TAG_REDUCE,
1241+
comm,
1242+
&reqs[num_reqs++]));
1243+
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
1244+
}
1245+
1246+
if (num_reqs > 0) {
1247+
err = ompi_request_wait_all(num_reqs, reqs, MPI_STATUS_IGNORE);
1248+
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
1249+
}
1250+
1251+
for (int i = 0; i < num_children; i++) {
1252+
ompi_op_reduce(op,
1253+
child_buf_start + (ptrdiff_t)i * count * extent,
1254+
reduce_buf,
1255+
count,
1256+
datatype);
1257+
}
1258+
1259+
if (rank != root) {
1260+
err = MCA_PML_CALL(send(reduce_buf_start,
1261+
count,
1262+
datatype,
1263+
tree->tree_prev,
1264+
MCA_COLL_BASE_TAG_REDUCE,
1265+
MCA_PML_BASE_SEND_STANDARD,
1266+
comm));
1267+
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
1268+
}
1269+
1270+
if (rank == root) {
1271+
err = ompi_datatype_copy_content_same_ddt(datatype, count,
1272+
(char*)recvbuf,
1273+
(char*)reduce_buf_start);
1274+
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
1275+
}
1276+
1277+
if (NULL != child_buf) free(child_buf);
1278+
if (NULL != reduce_buf) free(reduce_buf);
1279+
return MPI_SUCCESS;
1280+
1281+
err_hndl:
1282+
if (NULL != child_buf) {
1283+
free(child_buf);
1284+
child_buf = NULL;
1285+
child_buf_start = NULL;
1286+
}
1287+
if (NULL != reduce_buf) {
1288+
free(reduce_buf);
1289+
reduce_buf = NULL;
1290+
reduce_buf_start = NULL;
1291+
}
1292+
if( NULL != reqs ) {
1293+
if (MPI_ERR_IN_STATUS == err) {
1294+
for( num_reqs = 0; num_reqs < tree->tree_nextsize; num_reqs++ ) {
1295+
if (MPI_REQUEST_NULL == reqs[num_reqs]) continue;
1296+
if (MPI_ERR_PENDING == reqs[num_reqs]->req_status.MPI_ERROR) continue;
1297+
if (reqs[num_reqs]->req_status.MPI_ERROR != MPI_SUCCESS) {
1298+
err = reqs[num_reqs]->req_status.MPI_ERROR;
1299+
break;
1300+
}
1301+
}
1302+
}
1303+
ompi_coll_base_free_reqs(reqs, max_reqs);
1304+
}
1305+
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
1306+
__FILE__, line, err, rank));
1307+
(void)line; // silence compiler warning
1308+
return err;
1309+
1310+
}

ompi/mca/coll/tuned/coll_tuned_decision_fixed.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -808,9 +808,10 @@ int ompi_coll_tuned_reduce_intra_dec_fixed( const void *sendbuf, void *recvbuf,
808808
}
809809
}
810810

811+
int faninout = 2;
811812
return ompi_coll_tuned_reduce_intra_do_this (sendbuf, recvbuf, count, datatype,
812813
op, root, comm, module,
813-
alg, 0, 0, 0);
814+
alg, faninout, 0, 0);
814815
}
815816

816817
/*

ompi/mca/coll/tuned/coll_tuned_reduce_decision.c

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ static const mca_base_var_enum_value_t reduce_algorithms[] = {
4242
{5, "binomial"},
4343
{6, "in-order_binary"},
4444
{7, "rabenseifner"},
45+
{8, "knomial"},
4546
{0, NULL}
4647
};
4748

@@ -80,7 +81,7 @@ int ompi_coll_tuned_reduce_intra_check_forced_init (coll_tuned_force_algorithm_m
8081
mca_param_indices->algorithm_param_index =
8182
mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
8283
"reduce_algorithm",
83-
"Which reduce algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 chain, 3 pipeline, 4 binary, 5 binomial, 6 in-order binary, 7 rabenseifner. "
84+
"Which reduce algorithm is used. Can be locked down to choice of: 0 ignore, 1 linear, 2 chain, 3 pipeline, 4 binary, 5 binomial, 6 in-order binary, 7 rabenseifner, 8 knomial. "
8485
"Only relevant if coll_tuned_use_dynamic_rules is true.",
8586
MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE,
8687
OPAL_INFO_LVL_5,
@@ -177,6 +178,10 @@ int ompi_coll_tuned_reduce_intra_do_this(const void *sbuf, void* rbuf, int count
177178
segsize, max_requests);
178179
case (7): return ompi_coll_base_reduce_intra_redscat_gather(sbuf, rbuf, count, dtype,
179180
op, root, comm, module);
181+
case (8): return ompi_coll_base_reduce_intra_knomial(sbuf, rbuf, count, dtype,
182+
op, root, comm, module,
183+
segsize, max_requests,
184+
faninout);
180185
} /* switch */
181186
OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
182187
algorithm, ompi_coll_tuned_forced_max_algorithms[REDUCE]));

0 commit comments

Comments
 (0)