Skip to content

Commit d61d384

Browse files
authored
Merge pull request #11275 from wckzhang/dynamiccheck
osc/rdma: Add checks for gpu support during dynamic attach
2 parents d27799e + 6d383a0 commit d61d384

File tree

3 files changed

+48
-27
lines changed

3 files changed

+48
-27
lines changed

ompi/mca/osc/rdma/osc_rdma.h

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
#include "ompi/mca/osc/osc.h"
4545
#include "ompi/mca/osc/base/base.h"
4646
#include "opal/mca/btl/btl.h"
47+
#include "opal/mca/btl/base/base.h"
4748
#include "opal/mca/btl/base/btl_base_am_rdma.h"
4849
#include "ompi/memchecker.h"
4950
#include "ompi/op/op.h"
@@ -758,4 +759,36 @@ static inline int osc_rdma_is_accel(void *buf)
758759
uint64_t flags;
759760
return opal_accelerator.check_addr(buf, &dev_id, &flags);
760761
}
762+
763+
/**
764+
* @brief Checks whether any BTL in a list advertises accelerator RDMA support.
765+
*
766+
* @param[in] btls List of mca_btl_base_selected_module_t items to scan
767+
*
768+
* @returns true If any listed BTL module has MCA_BTL_FLAGS_ACCELERATOR_RDMA set in btl_flags
769+
* @returns false If the list is empty or no listed module has that flag set
770+
*/
771+
static inline bool osc_rdma_btl_accel_support(opal_list_t *btls)
772+
{
773+
mca_btl_base_selected_module_t* selected_btl;
774+
/* Verify that we have any BTLs available. Since we do not verify
775+
* connectivity across all BTLs in the alternate case, this is as
776+
* good a test as we are going to have for success. */
777+
if (opal_list_is_empty(btls)) {
778+
return false;
779+
}
780+
781+
/* The list is not empty: scan it and stop at the first BTL whose flags include
782+
* MCA_BTL_FLAGS_ACCELERATOR_RDMA. NOTE(review): the verbose message below is prefixed with "osc_rdma_component_query" although this helper is also called from ompi_osc_rdma_attach -- confirm whether the prefix should be updated. */
783+
OPAL_LIST_FOREACH(selected_btl, btls, mca_btl_base_selected_module_t) {
784+
opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output,
785+
"osc_rdma_component_query: check ACCELERATOR_RDMA flag: %s",
786+
selected_btl->btl_component->btl_version.mca_component_name);
787+
mca_btl_base_module_t *btl = selected_btl->btl_module;
788+
if (btl->btl_flags & MCA_BTL_FLAGS_ACCELERATOR_RDMA) {
789+
return true;
790+
}
791+
}
792+
return false;
793+
}
761794
#endif /* OMPI_OSC_RDMA_H */

ompi/mca/osc/rdma/osc_rdma_component.c

Lines changed: 3 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -373,10 +373,6 @@ static int ompi_osc_rdma_component_query (struct ompi_win_t *win, void **base, s
373373
int flavor)
374374
{
375375

376-
opal_list_t *btls = NULL;
377-
mca_btl_base_selected_module_t* selected_btl;
378-
int gpu_check = 0;
379-
380376
if (MPI_WIN_FLAVOR_SHARED == flavor) {
381377
return -1;
382378
}
@@ -385,7 +381,9 @@ static int ompi_osc_rdma_component_query (struct ompi_win_t *win, void **base, s
385381
uint64_t flags;
386382
int dev_id;
387383
if (opal_accelerator.check_addr(*base, &dev_id, &flags)) {
388-
gpu_check = 1;
384+
if (!osc_rdma_btl_accel_support(&mca_btl_base_modules_initialized)) {
385+
return -1;
386+
}
389387
}
390388
}
391389

@@ -396,28 +394,6 @@ static int ompi_osc_rdma_component_query (struct ompi_win_t *win, void **base, s
396394
return -1;
397395
}
398396

399-
/* Not on GPU at all, skip the check */
400-
if (!gpu_check) {
401-
goto ok;
402-
}
403-
404-
/* If we have any btls, we check again if any btl supports
405-
* MCA_BTL_FLAGS_ACCELERATOR_RDMA */
406-
btls = &mca_btl_base_modules_initialized;
407-
OPAL_LIST_FOREACH(selected_btl, btls, mca_btl_base_selected_module_t) {
408-
opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output,
409-
"osc_rdma_component_query: check ACCELERATOR_RDMA flag: %s",
410-
selected_btl->btl_component->btl_version.mca_component_name);
411-
mca_btl_base_module_t *btl = selected_btl->btl_module;
412-
// Check flag: MCA_BTL_FLAGS_ACCELERATOR_RDMA
413-
if (btl->btl_flags & MCA_BTL_FLAGS_ACCELERATOR_RDMA) {
414-
goto ok;
415-
}
416-
}
417-
/* No BTL supports the accelerator flag */
418-
return -1;
419-
420-
ok:
421397
return mca_osc_rdma_component.priority;
422398
}
423399

ompi/mca/osc/rdma/osc_rdma_dynamic.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,8 @@ int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len)
172172
intptr_t page_size = opal_getpagesize ();
173173
int region_index, ret;
174174
size_t aligned_len;
175+
int dev_id;
176+
uint64_t flags;
175177

176178
if (module->flavor != MPI_WIN_FLAVOR_DYNAMIC) {
177179
return OMPI_ERR_RMA_FLAVOR;
@@ -182,6 +184,16 @@ int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len)
182184
return OMPI_SUCCESS;
183185
}
184186

187+
/* The component query check does not apply to dynamic windows, so
188+
* support for attaching accelerator (GPU) buffers must be checked here. */
189+
if (opal_accelerator.check_addr(base, &dev_id, &flags)) {
190+
if (!osc_rdma_btl_accel_support(&mca_btl_base_modules_initialized)) {
191+
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "Failed to attach %p. No components capable of attaching accelerator buffers.",
192+
base);
193+
return OMPI_ERR_NOT_SUPPORTED;
194+
}
195+
}
196+
185197
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "attach: %s, %p, %lu", win->w_name, base, (unsigned long) len);
186198

187199
OPAL_THREAD_LOCK(&module->lock);

0 commit comments

Comments
 (0)