Merge pull request #10855 from wckzhang/buffer_id

wckzhang · web-flow · commit afd23c97d07e · 2022-09-27T10:02:04.000-07:00
opal/accelerator: Add get_buffer_id API
diff --git a/opal/class/opal_free_list.c b/opal/class/opal_free_list.c
@@ -52,7 +52,7 @@ static void opal_free_list_construct(opal_free_list_t *fl)
     fl->fl_mpool = NULL;
     fl->fl_rcache = NULL;
     /* default flags */
-    fl->fl_rcache_reg_flags = MCA_RCACHE_FLAGS_CACHE_BYPASS | MCA_RCACHE_FLAGS_CUDA_REGISTER_MEM;
+    fl->fl_rcache_reg_flags = MCA_RCACHE_FLAGS_CACHE_BYPASS | MCA_RCACHE_FLAGS_ACCELERATOR_REGISTER_MEM;
     fl->ctx = NULL;
     OBJ_CONSTRUCT(&(fl->fl_allocations), opal_list_t);
 }
@@ -190,7 +190,7 @@ int opal_free_list_grow_st(opal_free_list_t *flist, size_t num_elements,
         buffer_size = num_elements * elem_size;
         align = flist->fl_payload_buffer_alignment;
 
-        if (MCA_RCACHE_FLAGS_CUDA_REGISTER_MEM & flist->fl_rcache_reg_flags) {
+        if (MCA_RCACHE_FLAGS_ACCELERATOR_REGISTER_MEM & flist->fl_rcache_reg_flags) {
             size_t pagesize = opal_getpagesize();
             /* CUDA cannot handle registering overlapping regions, so make
              * sure each region is page sized and page aligned. */
diff --git a/opal/mca/accelerator/accelerator.h b/opal/mca/accelerator/accelerator.h
@@ -79,7 +79,6 @@
 
 BEGIN_C_DECLS
 
-
 #define MCA_ACCELERATOR_NO_DEVICE_ID -1
 /**
  * Accelerator flags
@@ -103,6 +102,8 @@ typedef enum {
     MCA_ACCELERATOR_TRANSFER_DTOD,
 } opal_accelerator_transfer_type_t;
 
+typedef uint64_t opal_accelerator_buffer_id_t;
+
 struct opal_accelerator_stream_t {
     opal_object_t super;
     /* Stream object */
@@ -359,6 +360,20 @@ typedef int (*opal_accelerator_base_module_get_device_fn_t)(
 typedef int (*opal_accelerator_base_module_device_can_access_peer_fn_t)(
     int *access, int dev1, int dev2);
 
+/**
+ * Retrieves the buffer ID associated with the given accelerator buffer address.
+ * If MCA_ACCELERATOR_NO_DEVICE_ID is provided, there is no device/process pairing.
+ *
+ * @param[IN] dev_id         ID of the device or MCA_ACCELERATOR_NO_DEVICE_ID
+ * @param[IN] addr           Buffer pointer to check
+ * @param[OUT] buf_id        ID of the given buffer
+ *
+ *
+ * @return                   OPAL_SUCCESS or error status on failure
+ */
+typedef int (*opal_accelerator_base_module_get_buffer_id_fn_t)(
+    int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id);
+
 /*
  * the standard public API data structure
  */
@@ -384,6 +399,8 @@ typedef struct {
 
     opal_accelerator_base_module_get_device_fn_t get_device;
     opal_accelerator_base_module_device_can_access_peer_fn_t device_can_access_peer;
+
+    opal_accelerator_base_module_get_buffer_id_fn_t get_buffer_id;
 } opal_accelerator_base_module_t;
 
 /**
diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.c b/opal/mca/accelerator/cuda/accelerator_cuda.c
@@ -47,6 +47,8 @@ static int accelerator_cuda_host_unregister(int dev_id, void *ptr);
 static int accelerator_cuda_get_device(int *dev_id);
 static int accelerator_cuda_device_can_access_peer( int *access, int dev1, int dev2);
 
+static int accelerator_cuda_get_buffer_id(int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id);
+
 opal_accelerator_base_module_t opal_accelerator_cuda_module =
 {
     accelerator_cuda_check_addr,
@@ -68,7 +70,9 @@ opal_accelerator_base_module_t opal_accelerator_cuda_module =
     accelerator_cuda_host_unregister,
 
     accelerator_cuda_get_device,
-    accelerator_cuda_device_can_access_peer
+    accelerator_cuda_device_can_access_peer,
+
+    accelerator_cuda_get_buffer_id
 };
 
 static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *flags)
@@ -538,3 +542,30 @@ static int accelerator_cuda_device_can_access_peer(int *access, int dev1, int de
     }
     return 0;
 }
+
+/*
+ * Get the buffer ID for the memory at addr and store it in *buf_id.
+ * This is needed to ensure the cached registration is not stale.  If a
+ * CUDA call fails, print an error and return OPAL_ERROR (dev_id is unused).
+ * Also set SYNC_MEMOPS on any GPU registration to ensure that
+ * synchronous copies complete before the buffer is accessed.
+ */
+static int accelerator_cuda_get_buffer_id(int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id)
+{
+    CUresult result;
+    int enable = 1;
+    result = opal_accelerator_cuda_func.cuPointerGetAttribute((unsigned long long *)buf_id, CU_POINTER_ATTRIBUTE_BUFFER_ID, (CUdeviceptr) addr);
+    if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) {
+        opal_show_help("help-accelerator-cuda.txt", "bufferID failed", true, OPAL_PROC_MY_HOSTNAME,
+                       result);
+        return OPAL_ERROR; /* a CUresult is not an OPAL status code */
+    }
+    result = opal_accelerator_cuda_func.cuPointerSetAttribute(&enable, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
+                                       (CUdeviceptr) addr);
+    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
+        opal_show_help("help-accelerator-cuda.txt", "cuPointerSetAttribute failed", true,
+                       OPAL_PROC_MY_HOSTNAME, result, addr);
+        return OPAL_ERROR; /* a CUresult is not an OPAL status code */
+    }
+    return OPAL_SUCCESS;
+}
diff --git a/opal/mca/accelerator/null/accelerator_null_component.c b/opal/mca/accelerator/null/accelerator_null_component.c
@@ -61,6 +61,8 @@ static int accelerator_null_host_unregister(int dev_id, void *ptr);
 static int accelerator_null_get_device(int *dev_id);
 static int accelerator_null_device_can_access_peer(int *access, int dev1, int dev2);
 
+static int accelerator_null_get_buffer_id(int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id);
+
 /*
  * Instantiate the public struct with all of our public information
  * and pointers to our public functions in it
@@ -120,7 +122,9 @@ opal_accelerator_base_module_t opal_accelerator_null_module =
     accelerator_null_host_unregister,
 
     accelerator_null_get_device,
-    accelerator_null_device_can_access_peer
+    accelerator_null_device_can_access_peer,
+
+    accelerator_null_get_buffer_id
 };
 
 static int accelerator_null_open(void)
@@ -235,3 +239,8 @@ static int accelerator_null_device_can_access_peer( int *access, int dev1, int d
 {
     return OPAL_ERR_NOT_IMPLEMENTED;
 }
+
+static int accelerator_null_get_buffer_id(int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id)
+{
+    return OPAL_ERR_NOT_IMPLEMENTED; /* stub: arguments are ignored and *buf_id is never written */
+}
diff --git a/opal/mca/accelerator/rocm/accelerator_rocm_module.c b/opal/mca/accelerator/rocm/accelerator_rocm_module.c
@@ -37,6 +37,7 @@ static int mca_accelerator_rocm_host_unregister(int dev_id, void *ptr);
 static int mca_accelerator_rocm_get_device(int *dev_id);
 static int mca_accelerator_rocm_device_can_access_peer( int *access, int dev1, int dev2);
 
+static int mca_accelerator_rocm_get_buffer_id(int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id);
 
 opal_accelerator_base_module_t opal_accelerator_rocm_module =
 {
@@ -59,7 +60,9 @@ opal_accelerator_base_module_t opal_accelerator_rocm_module =
     mca_accelerator_rocm_host_unregister,
 
     mca_accelerator_rocm_get_device,
-    mca_accelerator_rocm_device_can_access_peer
+    mca_accelerator_rocm_device_can_access_peer,
+
+    mca_accelerator_rocm_get_buffer_id
 };
 
 
@@ -483,3 +486,9 @@ static int mca_accelerator_rocm_device_can_access_peer(int *access, int dev1, in
 
     return OPAL_SUCCESS;
 }
+
+static int mca_accelerator_rocm_get_buffer_id(int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id)
+{
+    *buf_id = 0; /* always reports ID 0; dev_id and addr are not consulted */
+    return OPAL_SUCCESS;
+}
diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c
@@ -1006,7 +1006,7 @@ mca_btl_smcuda_register_mem(struct mca_btl_base_module_t *btl,
 
 #if OPAL_CUDA_GDR_SUPPORT
     if (MCA_BTL_REG_FLAG_CUDA_GPU_MEM & flags) {
-        rcache_flags |= MCA_RCACHE_FLAGS_CUDA_GPU_MEM;
+        rcache_flags |= MCA_RCACHE_FLAGS_ACCELERATOR_MEM;
     }
 #endif
 
diff --git a/opal/mca/rcache/grdma/rcache_grdma_module.c b/opal/mca/rcache/grdma/rcache_grdma_module.c
@@ -42,9 +42,6 @@
 #include "opal/mca/rcache/base/base.h"
 #include "opal/mca/rcache/rcache.h"
 #include "opal/mca/accelerator/accelerator.h"
-#if OPAL_CUDA_GDR_SUPPORT
-#include "opal/cuda/common_cuda.h"
-#endif /* OPAL_CUDA_GDR_SUPPORT */
 #include "opal/align.h"
 #include "opal/util/sys_limits.h"
 #include "rcache_grdma.h"
@@ -61,6 +58,7 @@ static int mca_rcache_grdma_invalidate_range(mca_rcache_base_module_t *rcache, v
 static void mca_rcache_grdma_finalize(mca_rcache_base_module_t *rcache);
 static bool mca_rcache_grdma_evict(mca_rcache_base_module_t *rcache);
 static int mca_rcache_grdma_add_to_gc(mca_rcache_base_registration_t *grdma_reg);
+static int check_for_accelerator_freed_memory(mca_rcache_base_module_t *rcache, void *addr, size_t size);
 
 static inline bool registration_flags_cacheable(uint32_t flags)
 {
@@ -75,9 +73,6 @@ static inline bool registration_is_cacheable(mca_rcache_base_registration_t *reg
     return registration_flags_cacheable(reg->flags);
 }
 
-#if OPAL_CUDA_GDR_SUPPORT
-static int check_for_cuda_freed_memory(mca_rcache_base_module_t *rcache, void *addr, size_t size);
-#endif /* OPAL_CUDA_GDR_SUPPORT */
 static void mca_rcache_grdma_cache_contructor(mca_rcache_grdma_cache_t *cache)
 {
     memset((void *) ((uintptr_t) cache + sizeof(cache->super)), 0,
@@ -328,8 +323,7 @@ static int mca_rcache_grdma_register(mca_rcache_base_module_t *rcache, void *add
     base = OPAL_DOWN_ALIGN_PTR(addr, page_size, unsigned char *);
     bound = OPAL_ALIGN_PTR((intptr_t) addr + size, page_size, unsigned char *) - 1;
 
-#if OPAL_CUDA_GDR_SUPPORT
-    if (flags & MCA_RCACHE_FLAGS_CUDA_GPU_MEM) {
+    if (flags & MCA_RCACHE_FLAGS_ACCELERATOR_MEM) {
         size_t psize;
         int res = opal_accelerator.get_address_range(MCA_ACCELERATOR_NO_DEVICE_ID, addr, (void **)&base, &psize);
         if (OPAL_SUCCESS != res) {
@@ -338,9 +332,8 @@ static int mca_rcache_grdma_register(mca_rcache_base_module_t *rcache, void *add
         bound = base + psize - 1;
         /* Check to see if this memory is in the cache and if it has been freed. If so,
          * this call will boot it out of the cache. */
-        check_for_cuda_freed_memory(rcache, base, psize);
+        check_for_accelerator_freed_memory(rcache, base, psize);
     }
-#endif /* OPAL_CUDA_GDR_SUPPORT */
 
     do_unregistration_gc(rcache);
 
@@ -378,11 +371,9 @@ static int mca_rcache_grdma_register(mca_rcache_base_module_t *rcache, void *add
     grdma_reg->flags = flags;
     grdma_reg->access_flags = access_flags;
     grdma_reg->ref_count = 1;
-#if OPAL_CUDA_GDR_SUPPORT
-    if (flags & MCA_RCACHE_FLAGS_CUDA_GPU_MEM) {
-        mca_common_cuda_get_buffer_id(grdma_reg);
+    if (flags & MCA_RCACHE_FLAGS_ACCELERATOR_MEM) {
+        opal_accelerator.get_buffer_id(MCA_ACCELERATOR_NO_DEVICE_ID, grdma_reg->base, &grdma_reg->gpu_bufID);
     }
-#endif /* OPAL_CUDA_GDR_SUPPORT */
 
     while (OPAL_ERR_OUT_OF_RESOURCE
            == (rc = rcache_grdma->resources.register_mem(rcache_grdma->resources.reg_data, base,
@@ -538,15 +529,34 @@ static int mca_rcache_grdma_invalidate_range(mca_rcache_base_module_t *rcache, v
                                        &args);
 }
 
+/* Check to see if the memory was freed between the time it was stored in
+ * the registration cache and now.  Return true if the memory was previously
+ * freed.  This is indicated by the BUFFER_ID value in the registration cache
+ * not matching the BUFFER_ID of the buffer we are checking.  Return false
+ * if the registration is still good.
+ */
+static bool mca_rcache_accelerator_previously_freed_memory(mca_rcache_base_registration_t *reg)
+{
+    int res;
+    opal_accelerator_buffer_id_t buf_id;
+    unsigned char *dbuf = reg->base;
+    res = opal_accelerator.get_buffer_id(MCA_ACCELERATOR_NO_DEVICE_ID, dbuf, &buf_id);
+    if (OPAL_UNLIKELY(res != OPAL_SUCCESS)) {
+        return true; /* ID lookup failed: buf_id is unusable, so report the registration as stale */
+    }
+    if (buf_id != reg->gpu_bufID) {
+        return true;
+    } else {
+        return false;
+    }
+}
+
 /* Make sure this registration request is not stale.  In other words, ensure
  * that we do not have a cuMemAlloc, cuMemFree, cuMemAlloc state.  If we do
  * kick out the regisrations and deregister.  This function needs to be called
  * with the rcache->vma_module->vma_lock held. */
-#if OPAL_CUDA_GDR_SUPPORT
-
-static int check_for_cuda_freed_memory(mca_rcache_base_module_t *rcache, void *addr, size_t size)
+static int check_for_accelerator_freed_memory(mca_rcache_base_module_t *rcache, void *addr, size_t size)
 {
-    unsigned long long buf_id;
     mca_rcache_grdma_module_t *rcache_grdma = (mca_rcache_grdma_module_t *) rcache;
     mca_rcache_base_registration_t *reg;
 
@@ -556,7 +566,7 @@ static int check_for_cuda_freed_memory(mca_rcache_base_module_t *rcache, void *a
     }
 
     /* If not previously freed memory, just return 0 */
-    if (!(mca_common_cuda_previously_freed_memory(reg))) {
+    if (!(mca_rcache_accelerator_previously_freed_memory(reg))) {
         return OPAL_SUCCESS;
     }
 
@@ -566,7 +576,6 @@ static int check_for_cuda_freed_memory(mca_rcache_base_module_t *rcache, void *a
     return mca_rcache_base_vma_iterate(rcache_grdma->cache->vma_module, addr, size, true, gc_add,
                                        NULL);
 }
-#endif /* OPAL_CUDA_GDR_SUPPORT */
 
 static void mca_rcache_grdma_finalize(mca_rcache_base_module_t *rcache)
 {
diff --git a/opal/mca/rcache/rcache.h b/opal/mca/rcache/rcache.h
@@ -28,6 +28,7 @@
 #include "opal/mca/mca.h"
 #include "opal/mca/mpool/mpool.h"
 #include "opal/mca/threads/mutex.h"
+#include "opal/mca/accelerator/accelerator.h"
 
 /* forward-declaration of rcache module structure */
 struct mca_rcache_base_module_t;
@@ -40,10 +41,10 @@ enum {
     MCA_RCACHE_FLAGS_PERSIST = 0x0002,
     /** registation requires strong ordering (disables relaxed ordering) */
     MCA_RCACHE_FLAGS_SO_MEM = 0x0004,
-    /** address range is cuda buffer */
-    MCA_RCACHE_FLAGS_CUDA_GPU_MEM = 0x0008,
-    /** register with common cuda */
-    MCA_RCACHE_FLAGS_CUDA_REGISTER_MEM = 0x0010,
+    /** address range is accelerator buffer */
+    MCA_RCACHE_FLAGS_ACCELERATOR_MEM = 0x0008,
+    /** register with accelerator framework */
+    MCA_RCACHE_FLAGS_ACCELERATOR_REGISTER_MEM = 0x0010,
     /** invalid registration (no valid for passing to rcache register) */
     MCA_RCACHE_FLAGS_INVALID = 0x0080,
     /** reserved for rcache module */
@@ -88,18 +89,16 @@ struct mca_rcache_base_registration_t {
     unsigned char *base;
     /** bound of registered region */
     unsigned char *bound;
-    /** artifact of old mpool/rcache architecture. used by cuda code */
+    /** artifact of old mpool/rcache architecture. */
     unsigned char *alloc_base;
     /** number of outstanding references */
     opal_atomic_int32_t ref_count;
     /** registration flags */
     opal_atomic_uint32_t flags;
     /** internal rcache context */
     void *rcache_context;
-#if OPAL_CUDA_GDR_SUPPORT
-    /** CUDA gpu buffer identifier */
-    unsigned long long gpu_bufID;
-#endif /* OPAL_CUDA_GDR_SUPPORT */
+    /** Accelerator buffer identifier */
+    opal_accelerator_buffer_id_t gpu_bufID;
     /** registration access flags */
     int32_t access_flags;
     unsigned char padding[64];

Original file line number	Diff line number	Diff line change
`@@ -1006,7 +1006,7 @@ mca_btl_smcuda_register_mem(struct mca_btl_base_module_t *btl,`
`1006`	`1006`
`1007`	`1007`	`#if OPAL_CUDA_GDR_SUPPORT`
`1008`	`1008`	`if (MCA_BTL_REG_FLAG_CUDA_GPU_MEM & flags) {`
`1009`		`- rcache_flags \|= MCA_RCACHE_FLAGS_CUDA_GPU_MEM;`
	`1009`	`+ rcache_flags \|= MCA_RCACHE_FLAGS_ACCELERATOR_MEM;`
`1010`	`1010`	`}`
`1011`	`1011`	`#endif`
`1012`	`1012`