2
2
/*
3
3
* Copyright (c) 2014-2016 Los Alamos National Security, LLC. All rights
4
4
* reserved.
5
+ * Copyright (c) 2020 Google, LLC. All rights reserved.
5
6
* $COPYRIGHT$
6
7
*
7
8
* Additional copyrights may follow
16
17
17
18
#include "opal/util/sys_limits.h"
18
19
20
+ static void ompi_osc_rdma_handle_init (ompi_osc_rdma_handle_t * rdma_handle )
21
+ {
22
+ rdma_handle -> btl_handle = NULL ;
23
+ OBJ_CONSTRUCT (& rdma_handle -> attachments , opal_list_t );
24
+ }
25
+
26
+ static void ompi_osc_rdma_handle_fini (ompi_osc_rdma_handle_t * rdma_handle )
27
+ {
28
+ OPAL_LIST_DESTRUCT (& rdma_handle -> attachments );
29
+ }
30
+
31
+ OBJ_CLASS_INSTANCE (ompi_osc_rdma_handle_t , opal_object_t , ompi_osc_rdma_handle_init ,
32
+ ompi_osc_rdma_handle_fini );
33
+
34
+ OBJ_CLASS_INSTANCE (ompi_osc_rdma_attachment_t , opal_list_item_t , NULL , NULL );
35
+
19
36
/**
20
37
* ompi_osc_rdma_find_region_containing:
21
38
*
@@ -48,13 +65,16 @@ static inline ompi_osc_rdma_region_t *ompi_osc_rdma_find_region_containing (ompi
48
65
49
66
region_bound = (intptr_t ) (region -> base + region -> len );
50
67
51
- OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_DEBUG , "checking memory region %p-%p against %p-%p (index %d) (min_index = %d, max_index = %d)" ,
52
- (void * ) base , (void * ) bound , (void * ) region -> base , ( void * )( region -> base + region -> len ), mid_index ,
53
- min_index , max_index );
68
+ OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_DEBUG , "checking memory region %p-%p against %p-%p (index %d) (min_index = %d, "
69
+ "max_index = %d)" , (void * ) base , (void * ) bound , (void * ) region -> base ,
70
+ ( void * )( region -> base + region -> len ), mid_index , min_index , max_index );
54
71
55
72
if (region -> base > base ) {
56
- return ompi_osc_rdma_find_region_containing (regions , min_index , mid_index - 1 , base , bound , region_size , region_index );
57
- } else if (bound <= region_bound ) {
73
+ return ompi_osc_rdma_find_region_containing (regions , min_index , mid_index - 1 , base , bound , region_size ,
74
+ region_index );
75
+ }
76
+
77
+ if (bound <= region_bound ) {
58
78
if (region_index ) {
59
79
* region_index = mid_index ;
60
80
}
@@ -66,24 +86,76 @@ static inline ompi_osc_rdma_region_t *ompi_osc_rdma_find_region_containing (ompi
66
86
}
67
87
68
88
/* binary search for insertion point */
69
- static ompi_osc_rdma_region_t * find_insertion_point (ompi_osc_rdma_region_t * regions , int min_index , int max_index , intptr_t base ,
70
- size_t region_size , int * region_index )
89
+ static ompi_osc_rdma_region_t * find_insertion_point (ompi_osc_rdma_region_t * regions , int min_index , int max_index ,
90
+ intptr_t base , size_t region_size , int * region_index )
71
91
{
72
92
int mid_index = (max_index + min_index ) >> 1 ;
73
93
ompi_osc_rdma_region_t * region = (ompi_osc_rdma_region_t * )((intptr_t ) regions + mid_index * region_size );
74
94
75
- OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_TRACE , "find_insertion_point (%d, %d, %lx, %lu)\n" , min_index , max_index , base , region_size );
95
+ OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_TRACE , "find_insertion_point (%d, %d, %lx, %lu)\n" , min_index , max_index , base ,
96
+ region_size );
76
97
77
98
if (max_index < min_index ) {
78
99
* region_index = min_index ;
79
100
return (ompi_osc_rdma_region_t * )((intptr_t ) regions + min_index * region_size );
80
101
}
81
102
82
- if (region -> base > base ) {
103
+ if (region -> base > base || ( region -> base == base && region -> len > region_size ) ) {
83
104
return find_insertion_point (regions , min_index , mid_index - 1 , base , region_size , region_index );
84
- } else {
85
- return find_insertion_point (regions , mid_index + 1 , max_index , base , region_size , region_index );
86
105
}
106
+
107
+ return find_insertion_point (regions , mid_index + 1 , max_index , base , region_size , region_index );
108
+ }
109
+
110
+ static bool ompi_osc_rdma_find_conflicting_attachment (ompi_osc_rdma_handle_t * handle , intptr_t base , intptr_t bound )
111
+ {
112
+ ompi_osc_rdma_attachment_t * attachment ;
113
+
114
+ OPAL_LIST_FOREACH (attachment , & handle -> attachments , ompi_osc_rdma_attachment_t ) {
115
+ intptr_t region_bound = attachment -> base + attachment -> len ;
116
+ if (base >= attachment -> base && base < region_bound ||
117
+ bound > attachment -> base && bound <= region_bound ) {
118
+ OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_TRACE , "existing region {%p, %p} overlaps region {%p, %p}" ,
119
+ (void * ) attachment -> base , (void * ) region_bound , (void * ) base , (void * ) bound );
120
+ return true;
121
+ }
122
+ }
123
+
124
+ return false;
125
+ }
126
+
127
+ static int ompi_osc_rdma_add_attachment (ompi_osc_rdma_handle_t * handle , intptr_t base , size_t len )
128
+ {
129
+ ompi_osc_rdma_attachment_t * attachment = OBJ_NEW (ompi_osc_rdma_attachment_t );
130
+ assert (NULL != attachment );
131
+
132
+ if (ompi_osc_rdma_find_conflicting_attachment (handle , base , base + len )) {
133
+ return OMPI_ERR_RMA_ATTACH ;
134
+ }
135
+
136
+ attachment -> base = base ;
137
+ attachment -> len = len ;
138
+
139
+ opal_list_append (& handle -> attachments , & attachment -> super );
140
+
141
+ return OMPI_SUCCESS ;
142
+ }
143
+
144
+ static int ompi_osc_rdma_remove_attachment (ompi_osc_rdma_handle_t * handle , intptr_t base )
145
+ {
146
+ ompi_osc_rdma_attachment_t * attachment ;
147
+
148
+ OPAL_LIST_FOREACH (attachment , & handle -> attachments , ompi_osc_rdma_attachment_t ) {
149
+ OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_TRACE , "checking attachment %p against %p" ,
150
+ (void * ) attachment -> base , (void * ) base );
151
+ if (attachment -> base == (intptr_t ) base ) {
152
+ opal_list_remove_item (& handle -> attachments , & attachment -> super );
153
+ OBJ_RELEASE (attachment );
154
+ return OMPI_SUCCESS ;
155
+ }
156
+ }
157
+
158
+ return OMPI_ERR_NOT_FOUND ;
87
159
}
88
160
89
161
int ompi_osc_rdma_attach (struct ompi_win_t * win , void * base , size_t len )
@@ -92,12 +164,13 @@ int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len)
92
164
const int my_rank = ompi_comm_rank (module -> comm );
93
165
ompi_osc_rdma_peer_t * my_peer = ompi_osc_rdma_module_peer (module , my_rank );
94
166
ompi_osc_rdma_region_t * region ;
167
+ ompi_osc_rdma_handle_t * rdma_region_handle ;
95
168
osc_rdma_counter_t region_count ;
96
169
osc_rdma_counter_t region_id ;
97
- void * bound ;
170
+ intptr_t bound , aligned_base , aligned_bound ;
98
171
intptr_t page_size = opal_getpagesize ();
99
- int region_index ;
100
- int ret ;
172
+ int region_index , ret ;
173
+ size_t aligned_len ;
101
174
102
175
if (module -> flavor != MPI_WIN_FLAVOR_DYNAMIC ) {
103
176
return OMPI_ERR_RMA_FLAVOR ;
@@ -117,23 +190,26 @@ int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len)
117
190
118
191
if (region_count == mca_osc_rdma_component .max_attach ) {
119
192
OPAL_THREAD_UNLOCK (& module -> lock );
193
+ OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_TRACE , "attach: could not attach. max attachment count reached." );
120
194
return OMPI_ERR_RMA_ATTACH ;
121
195
}
122
196
123
197
/* it is wasteful to register less than a page. this may allow the remote side to access more
124
198
* memory, but the MPI standard covers this by calling that behavior erroneous */
125
- bound = (void * )OPAL_ALIGN ((intptr_t ) base + len , page_size , intptr_t );
126
- base = (void * )((intptr_t ) base & ~(page_size - 1 ));
127
- len = (size_t )((intptr_t ) bound - (intptr_t ) base );
128
-
129
- /* see if a matching region already exists */
130
- region = ompi_osc_rdma_find_region_containing ((ompi_osc_rdma_region_t * ) module -> state -> regions , 0 , region_count - 1 , (intptr_t ) base ,
131
- (intptr_t ) bound , module -> region_size , & region_index );
199
+ bound = (intptr_t ) base + len ;
200
+ aligned_bound = OPAL_ALIGN ((intptr_t ) base + len , page_size , intptr_t );
201
+ aligned_base = (intptr_t ) base & ~(page_size - 1 );
202
+ aligned_len = (size_t )(aligned_bound - aligned_base );
203
+
204
+ /* see if a registered region already exists */
205
+ region = ompi_osc_rdma_find_region_containing ((ompi_osc_rdma_region_t * ) module -> state -> regions , 0 , region_count - 1 ,
206
+ aligned_base , aligned_bound , module -> region_size , & region_index );
132
207
if (NULL != region ) {
133
- ++ module -> dynamic_handles [region_index ].refcnt ;
208
+ /* validates that the region does not overlap with an existing region even if they are on the same page */
209
+ ret = ompi_osc_rdma_add_attachment (module -> dynamic_handles [region_index ], (intptr_t ) base , len );
134
210
OPAL_THREAD_UNLOCK (& module -> lock );
135
211
/* no need to invalidate remote caches */
136
- return OMPI_SUCCESS ;
212
+ return ret ;
137
213
}
138
214
139
215
/* region is in flux */
@@ -144,45 +220,50 @@ int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len)
144
220
145
221
/* do a binary search for where the region should be inserted */
146
222
if (region_count ) {
147
- region = find_insertion_point ((ompi_osc_rdma_region_t * ) module -> state -> regions , 0 , region_count - 1 , ( intptr_t ) base ,
148
- module -> region_size , & region_index );
223
+ region = find_insertion_point ((ompi_osc_rdma_region_t * ) module -> state -> regions , 0 , region_count - 1 ,
224
+ ( intptr_t ) base , module -> region_size , & region_index );
149
225
150
226
if (region_index < region_count ) {
151
- memmove ((void * ) ((intptr_t ) region + module -> region_size ), region , (region_count - region_index ) * module -> region_size );
152
-
153
- if (module -> selected_btl -> btl_register_mem ) {
154
- memmove (module -> dynamic_handles + region_index + 1 , module -> dynamic_handles + region_index ,
155
- (region_count - region_index ) * sizeof (module -> dynamic_handles [0 ]));
156
- }
227
+ memmove ((void * ) ((intptr_t ) region + module -> region_size ), region ,
228
+ (region_count - region_index ) * module -> region_size );
229
+ memmove (module -> dynamic_handles + region_index + 1 , module -> dynamic_handles + region_index ,
230
+ (region_count - region_index ) * sizeof (module -> dynamic_handles [0 ]));
157
231
}
158
232
} else {
159
233
region_index = 0 ;
160
234
region = (ompi_osc_rdma_region_t * ) module -> state -> regions ;
161
235
}
162
236
163
- region -> base = (intptr_t ) base ;
164
- region -> len = len ;
237
+ region -> base = aligned_base ;
238
+ region -> len = aligned_len ;
239
+
240
+ OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_DEBUG , "attaching dynamic memory region {%p, %p} aligned {%p, %p}, at index %d" ,
241
+ base , (void * ) bound , (void * ) aligned_base , (void * ) aligned_bound , region_index );
165
242
166
- OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_DEBUG , "attaching dynamic memory region {%p, %p} at index %d" ,
167
- base , (void * )((intptr_t ) base + len ), region_index );
243
+ /* add RDMA region handle to track this region */
244
+ rdma_region_handle = OBJ_NEW (ompi_osc_rdma_handle_t );
245
+ assert (NULL != rdma_region_handle );
168
246
169
247
if (module -> selected_btl -> btl_register_mem ) {
170
248
mca_btl_base_registration_handle_t * handle ;
171
249
172
- ret = ompi_osc_rdma_register (module , MCA_BTL_ENDPOINT_ANY , (void * ) region -> base , region -> len , MCA_BTL_REG_FLAG_ACCESS_ANY ,
173
- & handle );
250
+ ret = ompi_osc_rdma_register (module , MCA_BTL_ENDPOINT_ANY , (void * ) region -> base , region -> len ,
251
+ MCA_BTL_REG_FLAG_ACCESS_ANY , & handle );
174
252
if (OPAL_UNLIKELY (OMPI_SUCCESS != ret )) {
175
253
OPAL_THREAD_UNLOCK (& module -> lock );
254
+ OBJ_RELEASE (rdma_region_handle );
176
255
return OMPI_ERR_RMA_ATTACH ;
177
256
}
178
257
179
258
memcpy (region -> btl_handle_data , handle , module -> selected_btl -> btl_registration_handle_size );
180
- module -> dynamic_handles [ region_index ]. btl_handle = handle ;
259
+ rdma_region_handle -> btl_handle = handle ;
181
260
} else {
182
- module -> dynamic_handles [ region_index ]. btl_handle = NULL ;
261
+ rdma_region_handle -> btl_handle = NULL ;
183
262
}
184
263
185
- module -> dynamic_handles [region_index ].refcnt = 1 ;
264
+ assert (OMPI_SUCCESS == ompi_osc_rdma_add_attachment (rdma_region_handle , (intptr_t ) base , len ));
265
+
266
+ module -> dynamic_handles [region_index ] = rdma_region_handle ;
186
267
187
268
#if OPAL_ENABLE_DEBUG
188
269
for (int i = 0 ; i < region_count + 1 ; ++ i ) {
@@ -211,34 +292,46 @@ int ompi_osc_rdma_detach (struct ompi_win_t *win, const void *base)
211
292
ompi_osc_rdma_module_t * module = GET_MODULE (win );
212
293
const int my_rank = ompi_comm_rank (module -> comm );
213
294
ompi_osc_rdma_peer_dynamic_t * my_peer = (ompi_osc_rdma_peer_dynamic_t * ) ompi_osc_rdma_module_peer (module , my_rank );
295
+ ompi_osc_rdma_handle_t * rdma_region_handle ;
214
296
osc_rdma_counter_t region_count , region_id ;
215
297
ompi_osc_rdma_region_t * region ;
216
- int region_index ;
298
+ void * bound ;
299
+ int start_index = INT_MAX , region_index ;
217
300
218
301
if (module -> flavor != MPI_WIN_FLAVOR_DYNAMIC ) {
219
302
return OMPI_ERR_WIN ;
220
303
}
221
304
222
305
OPAL_THREAD_LOCK (& module -> lock );
223
306
224
- OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_TRACE , "detach: %s, %p" , win -> w_name , base );
225
-
226
307
/* the upper 4 bytes of the region count are an instance counter */
227
308
region_count = module -> state -> region_count & 0xffffffffL ;
228
309
region_id = module -> state -> region_count >> 32 ;
229
310
230
- region = ompi_osc_rdma_find_region_containing ((ompi_osc_rdma_region_t * ) module -> state -> regions , 0 ,
231
- region_count - 1 , (intptr_t ) base , (intptr_t ) base + 1 ,
232
- module -> region_size , & region_index );
233
- if (NULL == region ) {
234
- OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_INFO , "could not find dynamic memory region starting at %p" , base );
235
- OPAL_THREAD_UNLOCK (& module -> lock );
236
- return OMPI_ERROR ;
311
+ /* look up the associated region */
312
+ for (region_index = 0 ; region_index < region_count ; ++ region_index ) {
313
+ rdma_region_handle = module -> dynamic_handles [region_index ];
314
+ region = (ompi_osc_rdma_region_t * ) ((intptr_t ) module -> state -> regions + region_index * module -> region_size );
315
+ OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_INFO , "checking attachments at index %d {.base=%p, len=%lu} for attachment %p"
316
+ ", region handle=%p" , region_index , (void * ) region -> base , region -> len , base , rdma_region_handle );
317
+
318
+ if (region -> base > (uintptr_t ) base || (region -> base + region -> len ) < (uintptr_t ) base ) {
319
+ continue ;
320
+ }
321
+
322
+ if (OPAL_SUCCESS == ompi_osc_rdma_remove_attachment (rdma_region_handle , (intptr_t ) base )) {
323
+ break ;
324
+ }
237
325
}
238
326
239
- if (-- module -> dynamic_handles [region_index ].refcnt > 0 ) {
327
+ if (region_index == region_count ) {
328
+ OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_INFO , "could not find dynamic memory attachment for %p" , base );
240
329
OPAL_THREAD_UNLOCK (& module -> lock );
241
- OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_TRACE , "detach complete" );
330
+ return OMPI_ERR_BASE ;
331
+ }
332
+
333
+ if (!opal_list_is_empty (& rdma_region_handle -> attachments )) {
334
+ /* another region is referencing this attachment */
242
335
return OMPI_SUCCESS ;
243
336
}
244
337
@@ -249,21 +342,21 @@ int ompi_osc_rdma_detach (struct ompi_win_t *win, const void *base)
249
342
base , (void * )((intptr_t ) base + region -> len ), region_index );
250
343
251
344
if (module -> selected_btl -> btl_register_mem ) {
252
- ompi_osc_rdma_deregister (module , module -> dynamic_handles [ region_index ]. btl_handle );
345
+ ompi_osc_rdma_deregister (module , rdma_region_handle -> btl_handle );
253
346
254
- if (region_index < region_count - 1 ) {
255
- memmove (module -> dynamic_handles + region_index , module -> dynamic_handles + region_index + 1 ,
256
- (region_count - region_index - 1 ) * sizeof (void * ));
257
- }
258
-
259
- memset (module -> dynamic_handles + region_count - 1 , 0 , sizeof (module -> dynamic_handles [0 ]));
260
347
}
261
348
262
349
if (region_index < region_count - 1 ) {
350
+ size_t end_count = region_count - region_index - 1 ;
351
+ memmove (module -> dynamic_handles + region_index , module -> dynamic_handles + region_index + 1 ,
352
+ end_count * sizeof (module -> dynamic_handles [0 ]));
263
353
memmove (region , (void * )((intptr_t ) region + module -> region_size ),
264
- ( region_count - region_index - 1 ) * module -> region_size ); ;
354
+ end_count * module -> region_size );
265
355
}
266
356
357
+ OBJ_RELEASE (rdma_region_handle );
358
+ module -> dynamic_handles [region_count - 1 ] = NULL ;
359
+
267
360
module -> state -> region_count = ((region_id + 1 ) << 32 ) | (region_count - 1 );
268
361
269
362
ompi_osc_rdma_lock_release_exclusive (module , & my_peer -> super , offsetof (ompi_osc_rdma_state_t , regions_lock ));
0 commit comments