Skip to content

Commit 304cf97

Browse files
author
Thananon Patinyasakdikul
authored
Merge pull request #5334 from thananon/ofi_progress_fix
btl/ofi: progress now happens after a threshold.
2 parents c1ccbec + be76896 commit 304cf97

File tree

6 files changed

+36
-21
lines changed

6 files changed

+36
-21
lines changed

opal/mca/btl/ofi/btl_ofi.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@
4848
BEGIN_C_DECLS
4949
#define MCA_BTL_OFI_MAX_MODULES 16
5050
#define MCA_BTL_OFI_MAX_CQ_READ_ENTRIES 128
51+
/* Value assigned to mca_btl_ofi_component.num_cqe_read just before the
 * "num_cq_read" variable ("Number of completion entries to read from a
 * single cq_read.") is registered. */
#define MCA_BTL_OFI_NUM_CQE_READ 64
52+
/* Value assigned to mca_btl_ofi_component.progress_threshold just before the
 * "progress_threshold" variable is registered. MCA_BTL_OFI_NUM_RDMA_INC calls
 * the component's btl_progress() once a module's outstanding_rdma count is
 * greater than this threshold. */
#define MCA_BTL_OFI_PROGRESS_THRESHOLD 64
5153

5254
#define MCA_BTL_OFI_ABORT(args) mca_btl_ofi_exit(args)
5355

@@ -129,6 +131,7 @@ struct mca_btl_ofi_component_t {
129131
int module_count;
130132
int num_contexts_per_module;
131133
int num_cqe_read;
134+
int progress_threshold;
132135

133136
size_t namelen;
134137

opal/mca/btl/ofi/btl_ofi_atomics.c

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -81,9 +81,6 @@ int mca_btl_ofi_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_end
8181

8282
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
8383

84-
/* force a bit of progress. */
85-
mca_btl_ofi_component.super.btl_progress();
86-
8784
return OPAL_SUCCESS;
8885
}
8986

@@ -135,7 +132,6 @@ int mca_btl_ofi_aop (struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t
135132
}
136133

137134
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
138-
mca_btl_ofi_component.super.btl_progress();
139135

140136
return OPAL_SUCCESS;
141137
}
@@ -192,8 +188,5 @@ int mca_btl_ofi_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_e
192188

193189
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
194190

195-
/* force a bit of progress. */
196-
mca_btl_ofi_component.super.btl_progress();
197-
198191
return OPAL_SUCCESS;
199192
}

opal/mca/btl/ofi/btl_ofi_component.c

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -49,23 +49,34 @@ static int validate_info(struct fi_info *info)
4949
{
5050
int mr_mode;
5151

52+
BTL_VERBOSE(("validating device: %s", info->domain_attr->name));
53+
5254
/* we need exactly all the required bits */
5355
if ((info->caps & MCA_BTL_OFI_REQUIRED_CAPS) != MCA_BTL_OFI_REQUIRED_CAPS) {
56+
BTL_VERBOSE(("unsupported caps"));
5457
return OPAL_ERROR;
5558
}
5659

5760
/* we need FI_EP_RDM */
5861
if (info->ep_attr->type != FI_EP_RDM) {
62+
BTL_VERBOSE(("unsupported EP type"));
5963
return OPAL_ERROR;
6064
}
6165

6266
mr_mode = info->domain_attr->mr_mode;
6367

6468
if (!(mr_mode == FI_MR_BASIC || mr_mode == FI_MR_SCALABLE ||
6569
(mr_mode & ~(FI_MR_VIRT_ADDR | FI_MR_ALLOCATED | FI_MR_PROV_KEY)) == 0)) {
70+
BTL_VERBOSE(("unsupported MR mode"));
71+
return OPAL_ERROR;
72+
}
73+
74+
if (!(info->tx_attr->op_flags | FI_DELIVERY_COMPLETE)) {
75+
BTL_VERBOSE(("the endpoint tx_ctx does not support FI_DELIVERY_COMPLETE"));
6676
return OPAL_ERROR;
6777
}
6878

79+
BTL_VERBOSE(("device: %s is good to go.", info->domain_attr->name));
6980
return OPAL_SUCCESS;
7081
}
7182

@@ -102,14 +113,10 @@ static int mca_btl_ofi_component_register(void)
102113
MCA_BASE_VAR_SCOPE_READONLY,
103114
&prov_exclude);
104115

105-
/* Note: better leave it at 1 for now. osc rdma module is designed for 1 completion
106-
* at a time. Dealing with more than 1 completion in 1 read will confuse the osc rdma.
107-
* source: 8 hours of debugging. :(*/
108-
mca_btl_ofi_component.num_cqe_read = 1;
116+
mca_btl_ofi_component.num_cqe_read = MCA_BTL_OFI_NUM_CQE_READ;
109117
(void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
110118
"num_cq_read",
111-
"Number of completion entries to read from a single cq_read. "
112-
"(default: 1)",
119+
"Number of completion entries to read from a single cq_read. ",
113120
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
114121
OPAL_INFO_LVL_5,
115122
MCA_BASE_VAR_SCOPE_READONLY,
@@ -135,6 +142,7 @@ static int mca_btl_ofi_component_register(void)
135142
OPAL_INFO_LVL_5,
136143
MCA_BASE_VAR_SCOPE_READONLY,
137144
&mca_btl_ofi_component.num_contexts_per_module);
145+
138146
disable_sep = false;
139147
(void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
140148
"disable_sep",
@@ -144,6 +152,17 @@ static int mca_btl_ofi_component_register(void)
144152
MCA_BASE_VAR_SCOPE_READONLY,
145153
&disable_sep);
146154

155+
mca_btl_ofi_component.progress_threshold = MCA_BTL_OFI_PROGRESS_THRESHOLD;
156+
(void) mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version,
157+
"progress_threshold",
158+
"number of outstanding operation before btl will progress "
159+
"automatically. Tuning this might improve performance on "
160+
"certain type of application.",
161+
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
162+
OPAL_INFO_LVL_5,
163+
MCA_BASE_VAR_SCOPE_READONLY,
164+
&mca_btl_ofi_component.progress_threshold);
165+
147166
/* for now we want this component to lose to btl/ugni and btl/vader */
148167
module->super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH - 50;
149168

@@ -241,6 +260,8 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init (int *num_btl_modules,
241260
tx_attr.iov_limit = 1;
242261
rx_attr.iov_limit = 1;
243262

263+
tx_attr.op_flags = FI_DELIVERY_COMPLETE;
264+
244265
mca_btl_ofi_component.module_count = 0;
245266

246267
/* do the query. */

opal/mca/btl/ofi/btl_ofi_endpoint.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,7 @@ mca_btl_ofi_context_t *mca_btl_ofi_context_alloc_scalable(struct fi_info *info,
176176
struct fi_rx_attr rx_attr = {0};
177177

178178
mca_btl_ofi_context_t *contexts;
179+
tx_attr.op_flags = FI_DELIVERY_COMPLETE;
179180

180181
contexts = (mca_btl_ofi_context_t*) calloc(num_contexts, sizeof(*contexts));
181182
if (NULL == contexts) {

opal/mca/btl/ofi/btl_ofi_rdma.c

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -95,9 +95,6 @@ int mca_btl_ofi_get (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoi
9595

9696
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
9797

98-
/* force a bit of progress */
99-
mca_btl_ofi_component.super.btl_progress();
100-
10198
return OPAL_SUCCESS;
10299
}
103100

@@ -143,9 +140,6 @@ int mca_btl_ofi_put (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoi
143140

144141
MCA_BTL_OFI_NUM_RDMA_INC(ofi_btl);
145142

146-
/* force a bit of progress */
147-
mca_btl_ofi_component.super.btl_progress();
148-
149143
return OPAL_SUCCESS;
150144

151145
}

opal/mca/btl/ofi/btl_ofi_rdma.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,11 @@ mca_btl_ofi_completion_t *mca_btl_ofi_completion_alloc (
2929
void *cbcontext, void *cbdata,
3030
int type);
3131

32-
#define MCA_BTL_OFI_NUM_RDMA_INC(module) \
33-
OPAL_THREAD_ADD_FETCH64(&(module)->outstanding_rdma, 1);
32+
/*
 * Add one to (module)->outstanding_rdma through OPAL_THREAD_ADD_FETCH64 and,
 * when the resulting count is greater than
 * mca_btl_ofi_component.progress_threshold, call the component's
 * btl_progress().
 *
 * The expansion is a single do { } while (0) statement, so the macro can be
 * the sole body of an unbraced if/else; invoke it with a trailing semicolon.
 * `module` is evaluated exactly once and may be any pointer expression.
 * The threshold is compared against the value produced by the add-fetch
 * itself rather than a second, separate read of the counter.
 */
#define MCA_BTL_OFI_NUM_RDMA_INC(module)                                        \
    do {                                                                        \
        if (OPAL_THREAD_ADD_FETCH64(&(module)->outstanding_rdma, 1) >           \
            mca_btl_ofi_component.progress_threshold) {                         \
            mca_btl_ofi_component.super.btl_progress();                         \
        }                                                                       \
    } while (0)
3437

3538
/* Subtract one from (module)->outstanding_rdma through
 * OPAL_THREAD_ADD_FETCH64. The expansion already ends with ';', so a call
 * written with its own semicolon yields an extra empty statement.
 * NOTE(review): that makes the macro unsafe as the body of an unbraced
 * if/else - confirm how callers use it before changing. */
#define MCA_BTL_OFI_NUM_RDMA_DEC(module) \
3639
OPAL_THREAD_ADD_FETCH64(&(module)->outstanding_rdma, -1);

0 commit comments

Comments
 (0)