Skip to content

Commit eeb3d7f

Browse files
authored
Merge pull request #7387 from hjelmn/osc_rdma_allow_overlapping_registration_regions_and_return_the_correct_error_code_when_regions_overlap
osc/rdma: modify attach to check for region overlap
2 parents 3b7fd5a + 54c8233 commit eeb3d7f

File tree

5 files changed

+184
-68
lines changed

5 files changed

+184
-68
lines changed

ompi/mca/osc/rdma/osc_rdma.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
1414
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
1515
* Copyright (c) 2016-2018 Intel, Inc. All rights reserved.
16+
* Copyright (c) 2020 Google, LLC. All rights reserved.
1617
* $COPYRIGHT$
1718
*
1819
* Additional copyrights may follow
@@ -250,7 +251,7 @@ struct ompi_osc_rdma_module_t {
250251

251252
/** registration handles for dynamically attached regions. These are not stored
252253
* in the state structure as it is entirely local. */
253-
ompi_osc_rdma_handle_t *dynamic_handles;
254+
ompi_osc_rdma_handle_t **dynamic_handles;
254255

255256
/** shared memory segment. this segment holds this node's portion of the rank -> node
256257
* mapping array, node communication data (node_comm_info), state for all local ranks,

ompi/mca/osc/rdma/osc_rdma_component.c

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
2222
* Copyright (c) 2019 Research Organization for Information Science
2323
* and Technology (RIST). All rights reserved.
24+
* Copyright (c) 2020 Google, LLC. All rights reserved.
2425
* $COPYRIGHT$
2526
*
2627
* Additional copyrights may follow
@@ -218,11 +219,11 @@ static int ompi_osc_rdma_component_register (void)
218219
MCA_BASE_VAR_SCOPE_LOCAL, &mca_osc_rdma_component.buffer_size);
219220
free(description_str);
220221

221-
mca_osc_rdma_component.max_attach = 32;
222+
mca_osc_rdma_component.max_attach = 64;
222223
opal_asprintf(&description_str, "Maximum number of buffers that can be attached to a dynamic window. "
223224
"Keep in mind that each attached buffer will use a potentially limited "
224225
"resource (default: %d)", mca_osc_rdma_component.max_attach);
225-
(void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "max_attach", description_str,
226+
(void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "max_attach", description_str,
226227
MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, OPAL_INFO_LVL_3,
227228
MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.max_attach);
228229
free(description_str);
@@ -1267,8 +1268,8 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base,
12671268

12681269
if (MPI_WIN_FLAVOR_DYNAMIC == flavor) {
12691270
/* allocate space to store local btl handles for attached regions */
1270-
module->dynamic_handles = (ompi_osc_rdma_handle_t *) calloc (mca_osc_rdma_component.max_attach,
1271-
sizeof (module->dynamic_handles[0]));
1271+
module->dynamic_handles = (ompi_osc_rdma_handle_t **) calloc (mca_osc_rdma_component.max_attach,
1272+
sizeof (module->dynamic_handles[0]));
12721273
if (NULL == module->dynamic_handles) {
12731274
ompi_osc_rdma_free (win);
12741275
return OMPI_ERR_OUT_OF_RESOURCE;

ompi/mca/osc/rdma/osc_rdma_dynamic.c

Lines changed: 153 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
/*
33
* Copyright (c) 2014-2016 Los Alamos National Security, LLC. All rights
44
* reserved.
5+
* Copyright (c) 2020 Google, LLC. All rights reserved.
56
* $COPYRIGHT$
67
*
78
* Additional copyrights may follow
@@ -16,6 +17,22 @@
1617

1718
#include "opal/util/sys_limits.h"
1819

20+
static void ompi_osc_rdma_handle_init (ompi_osc_rdma_handle_t *rdma_handle)
21+
{
22+
rdma_handle->btl_handle = NULL;
23+
OBJ_CONSTRUCT(&rdma_handle->attachments, opal_list_t);
24+
}
25+
26+
static void ompi_osc_rdma_handle_fini (ompi_osc_rdma_handle_t *rdma_handle)
27+
{
28+
OPAL_LIST_DESTRUCT(&rdma_handle->attachments);
29+
}
30+
31+
OBJ_CLASS_INSTANCE(ompi_osc_rdma_handle_t, opal_object_t, ompi_osc_rdma_handle_init,
32+
ompi_osc_rdma_handle_fini);
33+
34+
OBJ_CLASS_INSTANCE(ompi_osc_rdma_attachment_t, opal_list_item_t, NULL, NULL);
35+
1936
/**
2037
* ompi_osc_rdma_find_region_containing:
2138
*
@@ -48,13 +65,16 @@ static inline ompi_osc_rdma_region_t *ompi_osc_rdma_find_region_containing (ompi
4865

4966
region_bound = (intptr_t) (region->base + region->len);
5067

51-
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "checking memory region %p-%p against %p-%p (index %d) (min_index = %d, max_index = %d)",
52-
(void *) base, (void *) bound, (void *) region->base, (void *)(region->base + region->len), mid_index,
53-
min_index, max_index);
68+
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "checking memory region %p-%p against %p-%p (index %d) (min_index = %d, "
69+
"max_index = %d)", (void *) base, (void *) bound, (void *) region->base,
70+
(void *)(region->base + region->len), mid_index, min_index, max_index);
5471

5572
if (region->base > base) {
56-
return ompi_osc_rdma_find_region_containing (regions, min_index, mid_index-1, base, bound, region_size, region_index);
57-
} else if (bound <= region_bound) {
73+
return ompi_osc_rdma_find_region_containing (regions, min_index, mid_index-1, base, bound, region_size,
74+
region_index);
75+
}
76+
77+
if (bound <= region_bound) {
5878
if (region_index) {
5979
*region_index = mid_index;
6080
}
@@ -66,24 +86,76 @@ static inline ompi_osc_rdma_region_t *ompi_osc_rdma_find_region_containing (ompi
6686
}
6787

6888
/* binary search for insertion point */
69-
static ompi_osc_rdma_region_t *find_insertion_point (ompi_osc_rdma_region_t *regions, int min_index, int max_index, intptr_t base,
70-
size_t region_size, int *region_index)
89+
static ompi_osc_rdma_region_t *find_insertion_point (ompi_osc_rdma_region_t *regions, int min_index, int max_index,
90+
intptr_t base, size_t region_size, int *region_index)
7191
{
7292
int mid_index = (max_index + min_index) >> 1;
7393
ompi_osc_rdma_region_t *region = (ompi_osc_rdma_region_t *)((intptr_t) regions + mid_index * region_size);
7494

75-
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "find_insertion_point (%d, %d, %lx, %lu)\n", min_index, max_index, base, region_size);
95+
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "find_insertion_point (%d, %d, %lx, %lu)\n", min_index, max_index, base,
96+
region_size);
7697

7798
if (max_index < min_index) {
7899
*region_index = min_index;
79100
return (ompi_osc_rdma_region_t *)((intptr_t) regions + min_index * region_size);
80101
}
81102

82-
if (region->base > base) {
103+
if (region->base > base || (region->base == base && region->len > region_size)) {
83104
return find_insertion_point (regions, min_index, mid_index-1, base, region_size, region_index);
84-
} else {
85-
return find_insertion_point (regions, mid_index+1, max_index, base, region_size, region_index);
86105
}
106+
107+
return find_insertion_point (regions, mid_index+1, max_index, base, region_size, region_index);
108+
}
109+
110+
static bool ompi_osc_rdma_find_conflicting_attachment (ompi_osc_rdma_handle_t *handle, intptr_t base, intptr_t bound)
111+
{
112+
ompi_osc_rdma_attachment_t *attachment;
113+
114+
OPAL_LIST_FOREACH(attachment, &handle->attachments, ompi_osc_rdma_attachment_t) {
115+
intptr_t region_bound = attachment->base + attachment->len;
116+
if (base >= attachment->base && base < region_bound ||
117+
bound > attachment->base && bound <= region_bound) {
118+
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "existing region {%p, %p} overlaps region {%p, %p}",
119+
(void *) attachment->base, (void *) region_bound, (void *) base, (void *) bound);
120+
return true;
121+
}
122+
}
123+
124+
return false;
125+
}
126+
127+
static int ompi_osc_rdma_add_attachment (ompi_osc_rdma_handle_t *handle, intptr_t base, size_t len)
128+
{
129+
ompi_osc_rdma_attachment_t *attachment = OBJ_NEW(ompi_osc_rdma_attachment_t);
130+
assert (NULL != attachment);
131+
132+
if (ompi_osc_rdma_find_conflicting_attachment(handle, base, base + len)) {
133+
return OMPI_ERR_RMA_ATTACH;
134+
}
135+
136+
attachment->base = base;
137+
attachment->len = len;
138+
139+
opal_list_append (&handle->attachments, &attachment->super);
140+
141+
return OMPI_SUCCESS;
142+
}
143+
144+
static int ompi_osc_rdma_remove_attachment (ompi_osc_rdma_handle_t *handle, intptr_t base)
145+
{
146+
ompi_osc_rdma_attachment_t *attachment;
147+
148+
OPAL_LIST_FOREACH(attachment, &handle->attachments, ompi_osc_rdma_attachment_t) {
149+
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "checking attachment %p against %p",
150+
(void *) attachment->base, (void *) base);
151+
if (attachment->base == (intptr_t) base) {
152+
opal_list_remove_item (&handle->attachments, &attachment->super);
153+
OBJ_RELEASE(attachment);
154+
return OMPI_SUCCESS;
155+
}
156+
}
157+
158+
return OMPI_ERR_NOT_FOUND;
87159
}
88160

89161
int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len)
@@ -92,12 +164,13 @@ int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len)
92164
const int my_rank = ompi_comm_rank (module->comm);
93165
ompi_osc_rdma_peer_t *my_peer = ompi_osc_rdma_module_peer (module, my_rank);
94166
ompi_osc_rdma_region_t *region;
167+
ompi_osc_rdma_handle_t *rdma_region_handle;
95168
osc_rdma_counter_t region_count;
96169
osc_rdma_counter_t region_id;
97-
void *bound;
170+
intptr_t bound, aligned_base, aligned_bound;
98171
intptr_t page_size = opal_getpagesize ();
99-
int region_index;
100-
int ret;
172+
int region_index, ret;
173+
size_t aligned_len;
101174

102175
if (module->flavor != MPI_WIN_FLAVOR_DYNAMIC) {
103176
return OMPI_ERR_RMA_FLAVOR;
@@ -117,23 +190,26 @@ int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len)
117190

118191
if (region_count == mca_osc_rdma_component.max_attach) {
119192
OPAL_THREAD_UNLOCK(&module->lock);
193+
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "attach: could not attach. max attachment count reached.");
120194
return OMPI_ERR_RMA_ATTACH;
121195
}
122196

123197
/* it is wasteful to register less than a page. this may allow the remote side to access more
124198
* memory but the MPI standard covers this with calling the calling behavior erroneous */
125-
bound = (void *)OPAL_ALIGN((intptr_t) base + len, page_size, intptr_t);
126-
base = (void *)((intptr_t) base & ~(page_size - 1));
127-
len = (size_t)((intptr_t) bound - (intptr_t) base);
128-
129-
/* see if a matching region already exists */
130-
region = ompi_osc_rdma_find_region_containing ((ompi_osc_rdma_region_t *) module->state->regions, 0, region_count - 1, (intptr_t) base,
131-
(intptr_t) bound, module->region_size, &region_index);
199+
bound = (intptr_t) base + len;
200+
aligned_bound = OPAL_ALIGN((intptr_t) base + len, page_size, intptr_t);
201+
aligned_base = (intptr_t) base & ~(page_size - 1);
202+
aligned_len = (size_t)(aligned_bound - aligned_base);
203+
204+
/* see if a registered region already exists */
205+
region = ompi_osc_rdma_find_region_containing ((ompi_osc_rdma_region_t *) module->state->regions, 0, region_count - 1,
206+
aligned_base, aligned_bound, module->region_size, &region_index);
132207
if (NULL != region) {
133-
++module->dynamic_handles[region_index].refcnt;
208+
/* validates that the region does not overlap with an existing region even if they are on the same page */
209+
ret = ompi_osc_rdma_add_attachment (module->dynamic_handles[region_index], (intptr_t) base, len);
134210
OPAL_THREAD_UNLOCK(&module->lock);
135211
/* no need to invalidate remote caches */
136-
return OMPI_SUCCESS;
212+
return ret;
137213
}
138214

139215
/* region is in flux */
@@ -144,45 +220,50 @@ int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len)
144220

145221
/* do a binary seach for where the region should be inserted */
146222
if (region_count) {
147-
region = find_insertion_point ((ompi_osc_rdma_region_t *) module->state->regions, 0, region_count - 1, (intptr_t) base,
148-
module->region_size, &region_index);
223+
region = find_insertion_point ((ompi_osc_rdma_region_t *) module->state->regions, 0, region_count - 1,
224+
(intptr_t) base, module->region_size, &region_index);
149225

150226
if (region_index < region_count) {
151-
memmove ((void *) ((intptr_t) region + module->region_size), region, (region_count - region_index) * module->region_size);
152-
153-
if (module->selected_btl->btl_register_mem) {
154-
memmove (module->dynamic_handles + region_index + 1, module->dynamic_handles + region_index,
155-
(region_count - region_index) * sizeof (module->dynamic_handles[0]));
156-
}
227+
memmove ((void *) ((intptr_t) region + module->region_size), region,
228+
(region_count - region_index) * module->region_size);
229+
memmove (module->dynamic_handles + region_index + 1, module->dynamic_handles + region_index,
230+
(region_count - region_index) * sizeof (module->dynamic_handles[0]));
157231
}
158232
} else {
159233
region_index = 0;
160234
region = (ompi_osc_rdma_region_t *) module->state->regions;
161235
}
162236

163-
region->base = (intptr_t) base;
164-
region->len = len;
237+
region->base = aligned_base;
238+
region->len = aligned_len;
239+
240+
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "attaching dynamic memory region {%p, %p} aligned {%p, %p}, at index %d",
241+
base, (void *) bound, (void *) aligned_base, (void *) aligned_bound, region_index);
165242

166-
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "attaching dynamic memory region {%p, %p} at index %d",
167-
base, (void *)((intptr_t) base + len), region_index);
243+
/* add RDMA region handle to track this region */
244+
rdma_region_handle = OBJ_NEW(ompi_osc_rdma_handle_t);
245+
assert (NULL != rdma_region_handle);
168246

169247
if (module->selected_btl->btl_register_mem) {
170248
mca_btl_base_registration_handle_t *handle;
171249

172-
ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, (void *) region->base, region->len, MCA_BTL_REG_FLAG_ACCESS_ANY,
173-
&handle);
250+
ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, (void *) region->base, region->len,
251+
MCA_BTL_REG_FLAG_ACCESS_ANY, &handle);
174252
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
175253
OPAL_THREAD_UNLOCK(&module->lock);
254+
OBJ_RELEASE(rdma_region_handle);
176255
return OMPI_ERR_RMA_ATTACH;
177256
}
178257

179258
memcpy (region->btl_handle_data, handle, module->selected_btl->btl_registration_handle_size);
180-
module->dynamic_handles[region_index].btl_handle = handle;
259+
rdma_region_handle->btl_handle = handle;
181260
} else {
182-
module->dynamic_handles[region_index].btl_handle = NULL;
261+
rdma_region_handle->btl_handle = NULL;
183262
}
184263

185-
module->dynamic_handles[region_index].refcnt = 1;
264+
assert(OMPI_SUCCESS == ompi_osc_rdma_add_attachment (rdma_region_handle, (intptr_t) base, len));
265+
266+
module->dynamic_handles[region_index] = rdma_region_handle;
186267

187268
#if OPAL_ENABLE_DEBUG
188269
for (int i = 0 ; i < region_count + 1 ; ++i) {
@@ -211,34 +292,46 @@ int ompi_osc_rdma_detach (struct ompi_win_t *win, const void *base)
211292
ompi_osc_rdma_module_t *module = GET_MODULE(win);
212293
const int my_rank = ompi_comm_rank (module->comm);
213294
ompi_osc_rdma_peer_dynamic_t *my_peer = (ompi_osc_rdma_peer_dynamic_t *) ompi_osc_rdma_module_peer (module, my_rank);
295+
ompi_osc_rdma_handle_t *rdma_region_handle;
214296
osc_rdma_counter_t region_count, region_id;
215297
ompi_osc_rdma_region_t *region;
216-
int region_index;
298+
void *bound;
299+
int start_index = INT_MAX, region_index;
217300

218301
if (module->flavor != MPI_WIN_FLAVOR_DYNAMIC) {
219302
return OMPI_ERR_WIN;
220303
}
221304

222305
OPAL_THREAD_LOCK(&module->lock);
223306

224-
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "detach: %s, %p", win->w_name, base);
225-
226307
/* the upper 4 bytes of the region count are an instance counter */
227308
region_count = module->state->region_count & 0xffffffffL;
228309
region_id = module->state->region_count >> 32;
229310

230-
region = ompi_osc_rdma_find_region_containing ((ompi_osc_rdma_region_t *) module->state->regions, 0,
231-
region_count - 1, (intptr_t) base, (intptr_t) base + 1,
232-
module->region_size, &region_index);
233-
if (NULL == region) {
234-
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "could not find dynamic memory region starting at %p", base);
235-
OPAL_THREAD_UNLOCK(&module->lock);
236-
return OMPI_ERROR;
311+
/* look up the associated region */
312+
for (region_index = 0 ; region_index < region_count ; ++region_index) {
313+
rdma_region_handle = module->dynamic_handles[region_index];
314+
region = (ompi_osc_rdma_region_t *) ((intptr_t) module->state->regions + region_index * module->region_size);
315+
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "checking attachments at index %d {.base=%p, len=%lu} for attachment %p"
316+
", region handle=%p", region_index, (void *) region->base, region->len, base, rdma_region_handle);
317+
318+
if (region->base > (uintptr_t) base || (region->base + region->len) < (uintptr_t) base) {
319+
continue;
320+
}
321+
322+
if (OPAL_SUCCESS == ompi_osc_rdma_remove_attachment (rdma_region_handle, (intptr_t) base)) {
323+
break;
324+
}
237325
}
238326

239-
if (--module->dynamic_handles[region_index].refcnt > 0) {
327+
if (region_index == region_count) {
328+
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "could not find dynamic memory attachment for %p", base);
240329
OPAL_THREAD_UNLOCK(&module->lock);
241-
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "detach complete");
330+
return OMPI_ERR_BASE;
331+
}
332+
333+
if (!opal_list_is_empty (&rdma_region_handle->attachments)) {
334+
/* another region is referencing this attachment */
242335
return OMPI_SUCCESS;
243336
}
244337

@@ -249,21 +342,21 @@ int ompi_osc_rdma_detach (struct ompi_win_t *win, const void *base)
249342
base, (void *)((intptr_t) base + region->len), region_index);
250343

251344
if (module->selected_btl->btl_register_mem) {
252-
ompi_osc_rdma_deregister (module, module->dynamic_handles[region_index].btl_handle);
345+
ompi_osc_rdma_deregister (module, rdma_region_handle->btl_handle);
253346

254-
if (region_index < region_count - 1) {
255-
memmove (module->dynamic_handles + region_index, module->dynamic_handles + region_index + 1,
256-
(region_count - region_index - 1) * sizeof (void *));
257-
}
258-
259-
memset (module->dynamic_handles + region_count - 1, 0, sizeof (module->dynamic_handles[0]));
260347
}
261348

262349
if (region_index < region_count - 1) {
350+
size_t end_count = region_count - region_index - 1;
351+
memmove (module->dynamic_handles + region_index, module->dynamic_handles + region_index + 1,
352+
end_count * sizeof (module->dynamic_handles[0]));
263353
memmove (region, (void *)((intptr_t) region + module->region_size),
264-
(region_count - region_index - 1) * module->region_size);;
354+
end_count * module->region_size);
265355
}
266356

357+
OBJ_RELEASE(rdma_region_handle);
358+
module->dynamic_handles[region_count - 1] = NULL;
359+
267360
module->state->region_count = ((region_id + 1) << 32) | (region_count - 1);
268361

269362
ompi_osc_rdma_lock_release_exclusive (module, &my_peer->super, offsetof (ompi_osc_rdma_state_t, regions_lock));

0 commit comments

Comments
 (0)