Skip to content

Commit 5025628

Browse files
authored
Merge pull request #7383 from hjelmn/fix_bug_7303_the_rcache_deadlock_v3
rcache/grdma: fix potential deadlock
2 parents c6831c5 + 14b6f49 commit 5025628

File tree

1 file changed

+39
-8
lines changed

1 file changed

+39
-8
lines changed

opal/mca/rcache/grdma/rcache_grdma_module.c

Lines changed: 39 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@
1919
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
2020
* Copyright (c) 2016 Research Organization for Information Science
2121
* and Technology (RIST). All rights reserved.
22+
* Copyright (c) 2020 Google, LLC. All rights reserved.
2223
*
2324
* $COPYRIGHT$
2425
*
@@ -167,24 +168,56 @@ static inline void do_unregistration_gc (mca_rcache_base_module_t *rcache)
167168
dereg_mem ((mca_rcache_base_registration_t *) item);
168169
}
169170
}
171+
172+
/*
 * Remove the registration at the head of cache->lru_list and claim it for
 * the caller.
 *
 * Each iteration takes cache->vma_module->vma_lock, pops the first list
 * item, and atomically rewrites its flags so that
 * MCA_RCACHE_GRDMA_REG_FLAG_IN_LRU is cleared and MCA_RCACHE_FLAGS_INVALID
 * is set. The lock is released before the function returns, so the caller
 * receives the registration without holding vma_lock.
 *
 * A popped registration whose previous flags already had
 * MCA_RCACHE_FLAGS_INVALID set is not returned; the loop moves on to the
 * next list head instead.
 *
 * Returns the claimed registration, or NULL once the LRU list is empty.
 */
static inline mca_rcache_base_registration_t *mca_rcache_grdma_remove_lru_head(mca_rcache_grdma_cache_t *cache) {
173+
mca_rcache_base_registration_t *old_reg;
174+
/* flags value observed immediately before the successful flag update */
int32_t old_flags;
175+
176+
do {
177+
opal_mutex_lock (&cache->vma_module->vma_lock);
178+
old_reg = (mca_rcache_base_registration_t *) opal_list_remove_first (&cache->lru_list);
179+
if (NULL == old_reg) {
180+
/* nothing left in the LRU list: drop the lock and fall through to return NULL */
opal_mutex_unlock (&cache->vma_module->vma_lock);
181+
break;
182+
}
183+
184+
/* retry until the flag update is applied against an unchanged flags value */
do {
185+
int32_t new_flags;
186+
old_flags = old_reg->flags;
187+
/* registration has been selected for removal and is no longer in the LRU. mark it
188+
* as such. */
189+
new_flags = (old_flags & ~MCA_RCACHE_GRDMA_REG_FLAG_IN_LRU) | MCA_RCACHE_FLAGS_INVALID;
190+
/* NOTE(review): presumably returns true only when flags still equalled old_flags
 * and new_flags was stored -- confirm against the OPAL atomics API */
if (opal_atomic_compare_exchange_strong_32(&old_reg->flags, &old_flags, new_flags)) {
191+
break;
192+
}
193+
} while (1);
194+
opal_mutex_unlock (&cache->vma_module->vma_lock);
195+
196+
if (old_flags & MCA_RCACHE_FLAGS_INVALID) {
197+
/* registration was already invalidated. in this case its fate is being determined
198+
* by another thread. */
199+
/* skip this entry; the outer loop re-takes the lock and pops the next head */
continue;
200+
}
201+
202+
return old_reg;
203+
} while (1);
204+
205+
return NULL;
206+
}
207+
170208
static inline bool mca_rcache_grdma_evict_lru_local (mca_rcache_grdma_cache_t *cache)
171209
{
172210
mca_rcache_grdma_module_t *rcache_grdma;
173211
mca_rcache_base_registration_t *old_reg;
174212

175-
opal_mutex_lock (&cache->vma_module->vma_lock);
176-
old_reg = (mca_rcache_base_registration_t *)
177-
opal_list_remove_first (&cache->lru_list);
213+
old_reg = mca_rcache_grdma_remove_lru_head(cache);
178214
if (NULL == old_reg) {
179-
opal_mutex_unlock (&cache->vma_module->vma_lock);
180215
return false;
181216
}
182217

183218
rcache_grdma = (mca_rcache_grdma_module_t *) old_reg->rcache;
184219

185220
(void) dereg_mem (old_reg);
186-
opal_mutex_unlock (&cache->vma_module->vma_lock);
187-
188221
rcache_grdma->stat_evicted++;
189222

190223
return true;
@@ -230,11 +263,9 @@ static inline void mca_rcache_grdma_remove_from_lru (mca_rcache_grdma_module_t *
230263

231264
/* opal lists are not thread safe at this time so we must lock :'( */
232265
opal_mutex_lock (&rcache_grdma->cache->vma_module->vma_lock);
233-
234266
opal_list_remove_item (&rcache_grdma->cache->lru_list, (opal_list_item_t *) grdma_reg);
235267
/* clear the LRU flag */
236268
grdma_reg->flags &= ~MCA_RCACHE_GRDMA_REG_FLAG_IN_LRU;
237-
238269
opal_mutex_unlock (&rcache_grdma->cache->vma_module->vma_lock);
239270
}
240271

0 commit comments

Comments
 (0)