Skip to content

Commit be76896

Browse files
author
Thananon Patinyasakdikul
committed
btl/ofi: progress now happens after a threshold.
This commit changes the way btl/ofi calls progress. Previously, we forced progress on every rdma/atomic call, which gave a performance boost in some cases and a slowdown in others. Now we only force progress once the number of outstanding rdma operations exceeds a threshold, which results in better performance overall. Also added a new MCA parameter 'mca_btl_ofi_progress_threshold' to set the threshold. The new default is 64. Also: added FI_DELIVERY_COMPLETE to the tx_attr op_flags to ensure that the completion is generated only after the message has been received on the remote side. Signed-off-by: Thananon Patinyasakdikul <thananon.patinyasakdikul@intel.com>
1 parent e59f58a commit be76896

File tree

6 files changed

+36
-21
lines changed

6 files changed

+36
-21
lines changed

opal/mca/btl/ofi/btl_ofi.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@
4848
BEGIN_C_DECLS
4949
#define MCA_BTL_OFI_MAX_MODULES 16
5050
#define MCA_BTL_OFI_MAX_CQ_READ_ENTRIES 128
51+
#define MCA_BTL_OFI_NUM_CQE_READ 64 /* default assigned to mca_btl_ofi_component.num_cqe_read (completion entries read per cq_read) */
52+
#define MCA_BTL_OFI_PROGRESS_THRESHOLD 64 /* default assigned to mca_btl_ofi_component.progress_threshold (outstanding ops before progress is forced) */
5153

5254
#define MCA_BTL_OFI_ABORT(args) mca_btl_ofi_exit(args)
5355

@@ -129,6 +131,7 @@ struct mca_btl_ofi_component_t {
129131
int module_count;
130132
int num_contexts_per_module;
131133
int num_cqe_read;
134+
int progress_threshold;
132135

133136
size_t namelen;
134137

opal/mca/btl/ofi/btl_ofi_atomics.c

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -81,9 +81,6 @@ int mca_btl_ofi_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_end
8181

8282
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
8383

84-
/* force a bit of progress. */
85-
mca_btl_ofi_component.super.btl_progress();
86-
8784
return OPAL_SUCCESS;
8885
}
8986

@@ -135,7 +132,6 @@ int mca_btl_ofi_aop (struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t
135132
}
136133

137134
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
138-
mca_btl_ofi_component.super.btl_progress();
139135

140136
return OPAL_SUCCESS;
141137
}
@@ -192,8 +188,5 @@ int mca_btl_ofi_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_e
192188

193189
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
194190

195-
/* force a bit of progress. */
196-
mca_btl_ofi_component.super.btl_progress();
197-
198191
return OPAL_SUCCESS;
199192
}

opal/mca/btl/ofi/btl_ofi_component.c

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -49,23 +49,34 @@ static int validate_info(struct fi_info *info)
4949
{
    /*
     * Validate one candidate fi_info entry against the requirements checked
     * below: required capability bits, endpoint type, memory registration
     * mode, and FI_DELIVERY_COMPLETE in the tx op_flags.
     *
     * Returns OPAL_SUCCESS when every check passes, OPAL_ERROR on the first
     * check that fails (the reason is logged through BTL_VERBOSE).
     */
    int mr_mode;

    BTL_VERBOSE(("validating device: %s", info->domain_attr->name));

    /* we need exactly all the required bits */
    if ((info->caps & MCA_BTL_OFI_REQUIRED_CAPS) != MCA_BTL_OFI_REQUIRED_CAPS) {
        BTL_VERBOSE(("unsupported caps"));
        return OPAL_ERROR;
    }

    /* we need FI_EP_RDM */
    if (info->ep_attr->type != FI_EP_RDM) {
        BTL_VERBOSE(("unsupported EP type"));
        return OPAL_ERROR;
    }

    mr_mode = info->domain_attr->mr_mode;

    /* accept FI_MR_BASIC, FI_MR_SCALABLE, or any combination made only of
     * the VIRT_ADDR / ALLOCATED / PROV_KEY mode bits */
    if (!(mr_mode == FI_MR_BASIC || mr_mode == FI_MR_SCALABLE ||
          (mr_mode & ~(FI_MR_VIRT_ADDR | FI_MR_ALLOCATED | FI_MR_PROV_KEY)) == 0)) {
        BTL_VERBOSE(("unsupported MR mode"));
        return OPAL_ERROR;
    }

    /* The flag must be tested with bitwise AND. The previous bitwise OR
     * always produced a non-zero value, so this check could never reject
     * a provider. */
    if (!(info->tx_attr->op_flags & FI_DELIVERY_COMPLETE)) {
        BTL_VERBOSE(("the endpoint tx_ctx does not support FI_DELIVERY_COMPLETE"));
        return OPAL_ERROR;
    }

    BTL_VERBOSE(("device: %s is good to go.", info->domain_attr->name));
    return OPAL_SUCCESS;
}
7182

@@ -102,14 +113,10 @@ static int mca_btl_ofi_component_register(void)
102113
MCA_BASE_VAR_SCOPE_READONLY,
103114
&prov_exclude);
104115

105-
/* Note: better leave it at 1 for now. osc rdma module is designed for 1 completion
106-
* at a time. Dealing with more than 1 completion in 1 read will confuse the osc rdma.
107-
* source: 8 hours of debugging. :(*/
108-
mca_btl_ofi_component.num_cqe_read = 1;
116+
mca_btl_ofi_component.num_cqe_read = MCA_BTL_OFI_NUM_CQE_READ;
109117
(void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
110118
"num_cq_read",
111-
"Number of completion entries to read from a single cq_read. "
112-
"(default: 1)",
119+
"Number of completion entries to read from a single cq_read. ",
113120
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
114121
OPAL_INFO_LVL_5,
115122
MCA_BASE_VAR_SCOPE_READONLY,
@@ -135,6 +142,7 @@ static int mca_btl_ofi_component_register(void)
135142
OPAL_INFO_LVL_5,
136143
MCA_BASE_VAR_SCOPE_READONLY,
137144
&mca_btl_ofi_component.num_contexts_per_module);
145+
138146
disable_sep = false;
139147
(void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
140148
"disable_sep",
@@ -144,6 +152,17 @@ static int mca_btl_ofi_component_register(void)
144152
MCA_BASE_VAR_SCOPE_READONLY,
145153
&disable_sep);
146154

155+
mca_btl_ofi_component.progress_threshold = MCA_BTL_OFI_PROGRESS_THRESHOLD;
156+
(void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
157+
"progress_threshold",
158+
"number of outstanding operation before btl will progress "
159+
"automatically. Tuning this might improve performance on "
160+
"certain type of application.",
161+
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
162+
OPAL_INFO_LVL_5,
163+
MCA_BASE_VAR_SCOPE_READONLY,
164+
&mca_btl_ofi_component.progress_threshold);
165+
147166
/* for now we want this component to lose to btl/ugni and btl/vader */
148167
module->super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH - 50;
149168

@@ -241,6 +260,8 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init (int *num_btl_modules,
241260
tx_attr.iov_limit = 1;
242261
rx_attr.iov_limit = 1;
243262

263+
tx_attr.op_flags = FI_DELIVERY_COMPLETE;
264+
244265
mca_btl_ofi_component.module_count = 0;
245266

246267
/* do the query. */

opal/mca/btl/ofi/btl_ofi_endpoint.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,7 @@ mca_btl_ofi_context_t *mca_btl_ofi_context_alloc_scalable(struct fi_info *info,
176176
struct fi_rx_attr rx_attr = {0};
177177

178178
mca_btl_ofi_context_t *contexts;
179+
tx_attr.op_flags = FI_DELIVERY_COMPLETE;
179180

180181
contexts = (mca_btl_ofi_context_t*) calloc(num_contexts, sizeof(*contexts));
181182
if (NULL == contexts) {

opal/mca/btl/ofi/btl_ofi_rdma.c

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -95,9 +95,6 @@ int mca_btl_ofi_get (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoi
9595

9696
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
9797

98-
/* force a bit of progress */
99-
mca_btl_ofi_component.super.btl_progress();
100-
10198
return OPAL_SUCCESS;
10299
}
103100

@@ -143,9 +140,6 @@ int mca_btl_ofi_put (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoi
143140

144141
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
145142

146-
/* force a bit of progress */
147-
mca_btl_ofi_component.super.btl_progress();
148-
149143
return OPAL_SUCCESS;
150144

151145
}

opal/mca/btl/ofi/btl_ofi_rdma.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,11 @@ mca_btl_ofi_completion_t *mca_btl_ofi_completion_alloc (
2929
void *cbcontext, void *cbdata,
3030
int type);
3131

32-
#define MCA_BTL_OFI_NUM_RDMA_INC(module) \
33-
OPAL_THREAD_ADD_FETCH64(&(module)->outstanding_rdma, 1);
32+
/*
 * Account for one newly posted operation on `module` by incrementing its
 * outstanding_rdma counter and, once the incremented count exceeds
 * mca_btl_ofi_component.progress_threshold, call the component progress
 * function.
 *
 * The threshold test uses the value returned by OPAL_THREAD_ADD_FETCH64
 * instead of re-reading the counter afterwards, `module` is parenthesized
 * so any pointer expression may be passed, and the body is wrapped in
 * do { } while (0) so the macro expands to exactly one statement (safe
 * inside an unbraced if/else).
 */
#define MCA_BTL_OFI_NUM_RDMA_INC(module)                                    \
    do {                                                                    \
        if (OPAL_THREAD_ADD_FETCH64(&(module)->outstanding_rdma, 1) >       \
            mca_btl_ofi_component.progress_threshold) {                     \
            mca_btl_ofi_component.super.btl_progress();                     \
        }                                                                   \
    } while (0)
3437

3538
#define MCA_BTL_OFI_NUM_RDMA_DEC(module) \
3639
OPAL_THREAD_ADD_FETCH64(&(module)->outstanding_rdma, -1);

0 commit comments

Comments
 (0)