Skip to content

Commit 6649aef

Browse files
committed
osc/rdma: modify attach to check for region overlap
This commit addresses two issues in osc/rdma: 1) It is erroneous to attach regions that overlap. This was being allowed but the standard does not allow overlapping attachments. 2) Overlapping registration regions (4k alignment of attachments) appear to be allowed. Add attachment bases to the bookeeping structure so we can keep better track of what can be detached. It is possible that the standard did not intend to allow #2. If that is the case then #2 should fail in the same way as #1. There should be no technical reason to disallow #2 at this time. References #7384 Signed-off-by: Nathan Hjelm <hjelmn@google.com>
1 parent 5025628 commit 6649aef

File tree

5 files changed

+183
-67
lines changed

5 files changed

+183
-67
lines changed

ompi/mca/osc/rdma/osc_rdma.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
1414
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
1515
* Copyright (c) 2016-2018 Intel, Inc. All rights reserved.
16+
* Copyright (c) 2020 Google, LLC. All rights reserved.
1617
* $COPYRIGHT$
1718
*
1819
* Additional copyrights may follow
@@ -250,7 +251,7 @@ struct ompi_osc_rdma_module_t {
250251

251252
/** registration handles for dynamically attached regions. These are not stored
252253
* in the state structure as it is entirely local. */
253-
ompi_osc_rdma_handle_t *dynamic_handles;
254+
ompi_osc_rdma_handle_t **dynamic_handles;
254255

255256
/** shared memory segment. this segment holds this node's portion of the rank -> node
256257
* mapping array, node communication data (node_comm_info), state for all local ranks,

ompi/mca/osc/rdma/osc_rdma_component.c

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
2222
* Copyright (c) 2019 Research Organization for Information Science
2323
* and Technology (RIST). All rights reserved.
24+
* Copyright (c) 2020 Google, LLC. All rights reserved.
2425
* $COPYRIGHT$
2526
*
2627
* Additional copyrights may follow
@@ -222,7 +223,7 @@ static int ompi_osc_rdma_component_register (void)
222223
opal_asprintf(&description_str, "Maximum number of buffers that can be attached to a dynamic window. "
223224
"Keep in mind that each attached buffer will use a potentially limited "
224225
"resource (default: %d)", mca_osc_rdma_component.max_attach);
225-
(void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "max_attach", description_str,
226+
(void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "max_attach", description_str,
226227
MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, OPAL_INFO_LVL_3,
227228
MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.max_attach);
228229
free(description_str);
@@ -1267,8 +1268,8 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base,
12671268

12681269
if (MPI_WIN_FLAVOR_DYNAMIC == flavor) {
12691270
/* allocate space to store local btl handles for attached regions */
1270-
module->dynamic_handles = (ompi_osc_rdma_handle_t *) calloc (mca_osc_rdma_component.max_attach,
1271-
sizeof (module->dynamic_handles[0]));
1271+
module->dynamic_handles = (ompi_osc_rdma_handle_t **) calloc (mca_osc_rdma_component.max_attach,
1272+
sizeof (module->dynamic_handles[0]));
12721273
if (NULL == module->dynamic_handles) {
12731274
ompi_osc_rdma_free (win);
12741275
return OMPI_ERR_OUT_OF_RESOURCE;

ompi/mca/osc/rdma/osc_rdma_dynamic.c

Lines changed: 153 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
/*
33
* Copyright (c) 2014-2016 Los Alamos National Security, LLC. All rights
44
* reserved.
5+
* Copyright (c) 2020 Google, LLC. All rights reserved.
56
* $COPYRIGHT$
67
*
78
* Additional copyrights may follow
@@ -16,6 +17,22 @@
1617

1718
#include "opal/util/sys_limits.h"
1819

20+
static void ompi_osc_rdma_handle_init (ompi_osc_rdma_handle_t *rdma_handle)
21+
{
22+
rdma_handle->btl_handle = NULL;
23+
OBJ_CONSTRUCT(&rdma_handle->attachments, opal_list_t);
24+
}
25+
26+
static void ompi_osc_rdma_handle_fini (ompi_osc_rdma_handle_t *rdma_handle)
27+
{
28+
OPAL_LIST_DESTRUCT(&rdma_handle->attachments);
29+
}
30+
31+
OBJ_CLASS_INSTANCE(ompi_osc_rdma_handle_t, opal_object_t, ompi_osc_rdma_handle_init,
32+
ompi_osc_rdma_handle_fini);
33+
34+
OBJ_CLASS_INSTANCE(ompi_osc_rdma_attachment_t, opal_list_item_t, NULL, NULL);
35+
1936
/**
2037
* ompi_osc_rdma_find_region_containing:
2138
*
@@ -48,13 +65,16 @@ static inline ompi_osc_rdma_region_t *ompi_osc_rdma_find_region_containing (ompi
4865

4966
region_bound = (intptr_t) (region->base + region->len);
5067

51-
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "checking memory region %p-%p against %p-%p (index %d) (min_index = %d, max_index = %d)",
52-
(void *) base, (void *) bound, (void *) region->base, (void *)(region->base + region->len), mid_index,
53-
min_index, max_index);
68+
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "checking memory region %p-%p against %p-%p (index %d) (min_index = %d, "
69+
"max_index = %d)", (void *) base, (void *) bound, (void *) region->base,
70+
(void *)(region->base + region->len), mid_index, min_index, max_index);
5471

5572
if (region->base > base) {
56-
return ompi_osc_rdma_find_region_containing (regions, min_index, mid_index-1, base, bound, region_size, region_index);
57-
} else if (bound <= region_bound) {
73+
return ompi_osc_rdma_find_region_containing (regions, min_index, mid_index-1, base, bound, region_size,
74+
region_index);
75+
}
76+
77+
if (bound <= region_bound) {
5878
if (region_index) {
5979
*region_index = mid_index;
6080
}
@@ -66,24 +86,76 @@ static inline ompi_osc_rdma_region_t *ompi_osc_rdma_find_region_containing (ompi
6686
}
6787

6888
/* binary search for insertion point */
69-
static ompi_osc_rdma_region_t *find_insertion_point (ompi_osc_rdma_region_t *regions, int min_index, int max_index, intptr_t base,
70-
size_t region_size, int *region_index)
89+
static ompi_osc_rdma_region_t *find_insertion_point (ompi_osc_rdma_region_t *regions, int min_index, int max_index,
90+
intptr_t base, size_t region_size, int *region_index)
7191
{
7292
int mid_index = (max_index + min_index) >> 1;
7393
ompi_osc_rdma_region_t *region = (ompi_osc_rdma_region_t *)((intptr_t) regions + mid_index * region_size);
7494

75-
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "find_insertion_point (%d, %d, %lx, %lu)\n", min_index, max_index, base, region_size);
95+
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "find_insertion_point (%d, %d, %lx, %lu)\n", min_index, max_index, base,
96+
region_size);
7697

7798
if (max_index < min_index) {
7899
*region_index = min_index;
79100
return (ompi_osc_rdma_region_t *)((intptr_t) regions + min_index * region_size);
80101
}
81102

82-
if (region->base > base) {
103+
if (region->base > base || (region->base == base && region->len > region_size)) {
83104
return find_insertion_point (regions, min_index, mid_index-1, base, region_size, region_index);
84-
} else {
85-
return find_insertion_point (regions, mid_index+1, max_index, base, region_size, region_index);
86105
}
106+
107+
return find_insertion_point (regions, mid_index+1, max_index, base, region_size, region_index);
108+
}
109+
110+
static bool ompi_osc_rdma_find_conflicting_attachment (ompi_osc_rdma_handle_t *handle, intptr_t base, intptr_t bound)
111+
{
112+
ompi_osc_rdma_attachment_t *attachment;
113+
114+
OPAL_LIST_FOREACH(attachment, &handle->attachments, ompi_osc_rdma_attachment_t) {
115+
intptr_t region_bound = attachment->base + attachment->len;
116+
if (base >= attachment->base && base < region_bound ||
117+
bound > attachment->base && bound <= region_bound) {
118+
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "existing region {%p, %p} overlaps region {%p, %p}",
119+
(void *) attachment->base, (void *) region_bound, (void *) base, (void *) bound);
120+
return true;
121+
}
122+
}
123+
124+
return false;
125+
}
126+
127+
static int ompi_osc_rdma_add_attachment (ompi_osc_rdma_handle_t *handle, intptr_t base, size_t len)
128+
{
129+
ompi_osc_rdma_attachment_t *attachment = OBJ_NEW(ompi_osc_rdma_attachment_t);
130+
assert (NULL != attachment);
131+
132+
if (ompi_osc_rdma_find_conflicting_attachment(handle, base, base + len)) {
133+
return OMPI_ERR_RMA_ATTACH;
134+
}
135+
136+
attachment->base = base;
137+
attachment->len = len;
138+
139+
opal_list_append (&handle->attachments, &attachment->super);
140+
141+
return OMPI_SUCCESS;
142+
}
143+
144+
static int ompi_osc_rdma_remove_attachment (ompi_osc_rdma_handle_t *handle, intptr_t base)
145+
{
146+
ompi_osc_rdma_attachment_t *attachment;
147+
148+
OPAL_LIST_FOREACH(attachment, &handle->attachments, ompi_osc_rdma_attachment_t) {
149+
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "checking attachment %p against %p",
150+
(void *) attachment->base, (void *) base);
151+
if (attachment->base == (intptr_t) base) {
152+
opal_list_remove_item (&handle->attachments, &attachment->super);
153+
OBJ_RELEASE(attachment);
154+
return OMPI_SUCCESS;
155+
}
156+
}
157+
158+
return OMPI_ERR_NOT_FOUND;
87159
}
88160

89161
int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len)
@@ -92,12 +164,13 @@ int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len)
92164
const int my_rank = ompi_comm_rank (module->comm);
93165
ompi_osc_rdma_peer_t *my_peer = ompi_osc_rdma_module_peer (module, my_rank);
94166
ompi_osc_rdma_region_t *region;
167+
ompi_osc_rdma_handle_t *rdma_region_handle;
95168
osc_rdma_counter_t region_count;
96169
osc_rdma_counter_t region_id;
97-
void *bound;
170+
intptr_t bound, aligned_base, aligned_bound;
98171
intptr_t page_size = opal_getpagesize ();
99-
int region_index;
100-
int ret;
172+
int region_index, ret;
173+
size_t aligned_len;
101174

102175
if (module->flavor != MPI_WIN_FLAVOR_DYNAMIC) {
103176
return OMPI_ERR_RMA_FLAVOR;
@@ -117,23 +190,26 @@ int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len)
117190

118191
if (region_count == mca_osc_rdma_component.max_attach) {
119192
OPAL_THREAD_UNLOCK(&module->lock);
193+
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "attach: could not attach. max attachment count reached.");
120194
return OMPI_ERR_RMA_ATTACH;
121195
}
122196

123197
/* it is wasteful to register less than a page. this may allow the remote side to access more
124198
* memory but the MPI standard covers this with calling the calling behavior erroneous */
125-
bound = (void *)OPAL_ALIGN((intptr_t) base + len, page_size, intptr_t);
126-
base = (void *)((intptr_t) base & ~(page_size - 1));
127-
len = (size_t)((intptr_t) bound - (intptr_t) base);
128-
129-
/* see if a matching region already exists */
130-
region = ompi_osc_rdma_find_region_containing ((ompi_osc_rdma_region_t *) module->state->regions, 0, region_count - 1, (intptr_t) base,
131-
(intptr_t) bound, module->region_size, &region_index);
199+
bound = (intptr_t) base + len;
200+
aligned_bound = OPAL_ALIGN((intptr_t) base + len, page_size, intptr_t);
201+
aligned_base = (intptr_t) base & ~(page_size - 1);
202+
aligned_len = (size_t)(aligned_bound - aligned_base);
203+
204+
/* see if a registered region already exists */
205+
region = ompi_osc_rdma_find_region_containing ((ompi_osc_rdma_region_t *) module->state->regions, 0, region_count - 1,
206+
aligned_base, aligned_bound, module->region_size, &region_index);
132207
if (NULL != region) {
133-
++module->dynamic_handles[region_index].refcnt;
208+
/* validates that the region does not overlap with an existing region even if they are on the same page */
209+
ret = ompi_osc_rdma_add_attachment (module->dynamic_handles[region_index], (intptr_t) base, len);
134210
OPAL_THREAD_UNLOCK(&module->lock);
135211
/* no need to invalidate remote caches */
136-
return OMPI_SUCCESS;
212+
return ret;
137213
}
138214

139215
/* region is in flux */
@@ -144,45 +220,50 @@ int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len)
144220

145221
/* do a binary seach for where the region should be inserted */
146222
if (region_count) {
147-
region = find_insertion_point ((ompi_osc_rdma_region_t *) module->state->regions, 0, region_count - 1, (intptr_t) base,
148-
module->region_size, &region_index);
223+
region = find_insertion_point ((ompi_osc_rdma_region_t *) module->state->regions, 0, region_count - 1,
224+
(intptr_t) base, module->region_size, &region_index);
149225

150226
if (region_index < region_count) {
151-
memmove ((void *) ((intptr_t) region + module->region_size), region, (region_count - region_index) * module->region_size);
152-
153-
if (module->selected_btl->btl_register_mem) {
154-
memmove (module->dynamic_handles + region_index + 1, module->dynamic_handles + region_index,
155-
(region_count - region_index) * sizeof (module->dynamic_handles[0]));
156-
}
227+
memmove ((void *) ((intptr_t) region + module->region_size), region,
228+
(region_count - region_index) * module->region_size);
229+
memmove (module->dynamic_handles + region_index + 1, module->dynamic_handles + region_index,
230+
(region_count - region_index) * sizeof (module->dynamic_handles[0]));
157231
}
158232
} else {
159233
region_index = 0;
160234
region = (ompi_osc_rdma_region_t *) module->state->regions;
161235
}
162236

163-
region->base = (intptr_t) base;
164-
region->len = len;
237+
region->base = aligned_base;
238+
region->len = aligned_len;
239+
240+
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "attaching dynamic memory region {%p, %p} aligned {%p, %p}, at index %d",
241+
base, (void *) bound, (void *) aligned_base, (void *) aligned_bound, region_index);
165242

166-
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "attaching dynamic memory region {%p, %p} at index %d",
167-
base, (void *)((intptr_t) base + len), region_index);
243+
/* add RDMA region handle to track this region */
244+
rdma_region_handle = OBJ_NEW(ompi_osc_rdma_handle_t);
245+
assert (NULL != rdma_region_handle);
168246

169247
if (module->selected_btl->btl_register_mem) {
170248
mca_btl_base_registration_handle_t *handle;
171249

172-
ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, (void *) region->base, region->len, MCA_BTL_REG_FLAG_ACCESS_ANY,
173-
&handle);
250+
ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, (void *) region->base, region->len,
251+
MCA_BTL_REG_FLAG_ACCESS_ANY, &handle);
174252
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
175253
OPAL_THREAD_UNLOCK(&module->lock);
254+
OBJ_RELEASE(rdma_region_handle);
176255
return OMPI_ERR_RMA_ATTACH;
177256
}
178257

179258
memcpy (region->btl_handle_data, handle, module->selected_btl->btl_registration_handle_size);
180-
module->dynamic_handles[region_index].btl_handle = handle;
259+
rdma_region_handle->btl_handle = handle;
181260
} else {
182-
module->dynamic_handles[region_index].btl_handle = NULL;
261+
rdma_region_handle->btl_handle = NULL;
183262
}
184263

185-
module->dynamic_handles[region_index].refcnt = 1;
264+
assert(OMPI_SUCCESS == ompi_osc_rdma_add_attachment (rdma_region_handle, (intptr_t) base, len));
265+
266+
module->dynamic_handles[region_index] = rdma_region_handle;
186267

187268
#if OPAL_ENABLE_DEBUG
188269
for (int i = 0 ; i < region_count + 1 ; ++i) {
@@ -211,34 +292,46 @@ int ompi_osc_rdma_detach (struct ompi_win_t *win, const void *base)
211292
ompi_osc_rdma_module_t *module = GET_MODULE(win);
212293
const int my_rank = ompi_comm_rank (module->comm);
213294
ompi_osc_rdma_peer_dynamic_t *my_peer = (ompi_osc_rdma_peer_dynamic_t *) ompi_osc_rdma_module_peer (module, my_rank);
295+
ompi_osc_rdma_handle_t *rdma_region_handle;
214296
osc_rdma_counter_t region_count, region_id;
215297
ompi_osc_rdma_region_t *region;
216-
int region_index;
298+
void *bound;
299+
int start_index = INT_MAX, region_index;
217300

218301
if (module->flavor != MPI_WIN_FLAVOR_DYNAMIC) {
219302
return OMPI_ERR_WIN;
220303
}
221304

222305
OPAL_THREAD_LOCK(&module->lock);
223306

224-
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "detach: %s, %p", win->w_name, base);
225-
226307
/* the upper 4 bytes of the region count are an instance counter */
227308
region_count = module->state->region_count & 0xffffffffL;
228309
region_id = module->state->region_count >> 32;
229310

230-
region = ompi_osc_rdma_find_region_containing ((ompi_osc_rdma_region_t *) module->state->regions, 0,
231-
region_count - 1, (intptr_t) base, (intptr_t) base + 1,
232-
module->region_size, &region_index);
233-
if (NULL == region) {
234-
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "could not find dynamic memory region starting at %p", base);
235-
OPAL_THREAD_UNLOCK(&module->lock);
236-
return OMPI_ERROR;
311+
/* look up the associated region */
312+
for (region_index = 0 ; region_index < region_count ; ++region_index) {
313+
rdma_region_handle = module->dynamic_handles[region_index];
314+
region = (ompi_osc_rdma_region_t *) ((intptr_t) module->state->regions + region_index * module->region_size);
315+
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "checking attachments at index %d {.base=%p, len=%lu} for attachment %p"
316+
", region handle=%p", region_index, (void *) region->base, region->len, base, rdma_region_handle);
317+
318+
if (region->base > (uintptr_t) base || (region->base + region->len) < (uintptr_t) base) {
319+
continue;
320+
}
321+
322+
if (OPAL_SUCCESS == ompi_osc_rdma_remove_attachment (rdma_region_handle, (intptr_t) base)) {
323+
break;
324+
}
237325
}
238326

239-
if (--module->dynamic_handles[region_index].refcnt > 0) {
327+
if (region_index == region_count) {
328+
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "could not find dynamic memory attachment for %p", base);
240329
OPAL_THREAD_UNLOCK(&module->lock);
241-
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "detach complete");
330+
return OMPI_ERR_BASE;
331+
}
332+
333+
if (!opal_list_is_empty (&rdma_region_handle->attachments)) {
334+
/* another region is referencing this attachment */
242335
return OMPI_SUCCESS;
243336
}
244337

@@ -249,21 +342,21 @@ int ompi_osc_rdma_detach (struct ompi_win_t *win, const void *base)
249342
base, (void *)((intptr_t) base + region->len), region_index);
250343

251344
if (module->selected_btl->btl_register_mem) {
252-
ompi_osc_rdma_deregister (module, module->dynamic_handles[region_index].btl_handle);
345+
ompi_osc_rdma_deregister (module, rdma_region_handle->btl_handle);
253346

254-
if (region_index < region_count - 1) {
255-
memmove (module->dynamic_handles + region_index, module->dynamic_handles + region_index + 1,
256-
(region_count - region_index - 1) * sizeof (void *));
257-
}
258-
259-
memset (module->dynamic_handles + region_count - 1, 0, sizeof (module->dynamic_handles[0]));
260347
}
261348

262349
if (region_index < region_count - 1) {
350+
size_t end_count = region_count - region_index - 1;
351+
memmove (module->dynamic_handles + region_index, module->dynamic_handles + region_index + 1,
352+
end_count * sizeof (module->dynamic_handles[0]));
263353
memmove (region, (void *)((intptr_t) region + module->region_size),
264-
(region_count - region_index - 1) * module->region_size);;
354+
end_count * module->region_size);
265355
}
266356

357+
OBJ_RELEASE(rdma_region_handle);
358+
module->dynamic_handles[region_count - 1] = NULL;
359+
267360
module->state->region_count = ((region_id + 1) << 32) | (region_count - 1);
268361

269362
ompi_osc_rdma_lock_release_exclusive (module, &my_peer->super, offsetof (ompi_osc_rdma_state_t, regions_lock));

0 commit comments

Comments
 (0)