Skip to content

Commit ce83746

Browse files
r-barnes authored and facebook-github-bot committed
Error check some CUDA API calls (#1626)
Summary: Pull Request resolved: #1626 Reviewed By: sryap Differential Revision: D43787029 fbshipit-source-id: 87e07acf39010d489366d3e4ea10b9a33dec1fd5
1 parent 7c7aee0 commit ce83746

File tree

4 files changed

+19
-16
lines changed

4 files changed

+19
-16
lines changed

fbgemm_gpu/include/fbgemm_gpu/bench_utils.cuh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,8 @@ void flush_cache(int cache_size_mb = 40, bool do_write = false) {
3131
CUDA_CHECK(
3232
cudaMemcpy(d_flush, flush.data(), cache_size, cudaMemcpyHostToDevice));
3333
flush_gpu<<<cache_size / 512, 512>>>(d_flush, d_flush2, do_write);
34-
cudaFree(d_flush);
35-
cudaFree(d_flush2);
34+
CUDA_CHECK(cudaFree(d_flush));
35+
CUDA_CHECK(cudaFree(d_flush2));
3636
CUDA_CHECK(cudaDeviceSynchronize());
3737
CUDA_CHECK(cudaGetLastError());
3838
}

fbgemm_gpu/src/jagged_tensor_ops.cu

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -651,10 +651,10 @@ bool jagged_dense_dense_elementwise_jagged_output_matches_opt(
651651

652652
int max_shared_bytes;
653653
#ifndef __HIP_PLATFORM_HCC__
654-
cudaDeviceGetAttribute(
654+
C10_CUDA_CHECK(cudaDeviceGetAttribute(
655655
&max_shared_bytes,
656656
cudaDevAttrMaxSharedMemoryPerBlockOptin,
657-
y_0_reshaped.get_device());
657+
y_0_reshaped.get_device()));
658658
#else
659659
// MI100 has 64 KB local memory (shared memory) per workgroup
660660
max_shared_bytes = 64 << 10;
@@ -769,10 +769,10 @@ void jagged_dense_elementwise_jagged_output_opt_(
769769
if (dynamic_smem_size > cur_max_shared_bytes) {
770770
int max_shared_bytes;
771771
#ifndef __HIP_PLATFORM_HCC__
772-
cudaDeviceGetAttribute(
772+
C10_CUDA_CHECK(cudaDeviceGetAttribute(
773773
&max_shared_bytes,
774774
cudaDevAttrMaxSharedMemoryPerBlockOptin,
775-
y_reshaped.get_device());
775+
y_reshaped.get_device()));
776776
#else
777777
// MI100 has 64 KB local memory (shared memory) per workgroup
778778
max_shared_bytes = 64 << 10;
@@ -788,11 +788,11 @@ void jagged_dense_elementwise_jagged_output_opt_(
788788
#endif
789789
int used_shared_bytes = used_shared_kb << 10;
790790
#ifndef __HIP_PLATFORM_HCC__
791-
cudaFuncSetAttribute(
791+
C10_CUDA_CHECK(cudaFuncSetAttribute(
792792
jagged_dense_dense_elementwise_jagged_output_opt_search_kernel_<
793793
index_t>,
794794
cudaFuncAttributeMaxDynamicSharedMemorySize,
795-
used_shared_bytes); // V100: 64 KB; A100: 96 KB.
795+
used_shared_bytes)); // V100: 64 KB; A100: 96 KB.
796796
#endif
797797
C10_CUDA_KERNEL_LAUNCH_CHECK();
798798
TORCH_CHECK(dynamic_smem_size <= used_shared_bytes);
@@ -973,10 +973,10 @@ void jagged_dense_dense_elementwise_jagged_output_opt_(
973973
if (dynamic_smem_size > cur_max_shared_bytes) {
974974
int max_shared_bytes;
975975
#ifndef __HIP_PLATFORM_HCC__
976-
cudaDeviceGetAttribute(
976+
C10_CUDA_CHECK(cudaDeviceGetAttribute(
977977
&max_shared_bytes,
978978
cudaDevAttrMaxSharedMemoryPerBlockOptin,
979-
y_0_reshaped.get_device());
979+
y_0_reshaped.get_device()));
980980
#else
981981
// MI100 has 64 KB local memory (shared memory) per workgroup
982982
max_shared_bytes = 64 << 10;
@@ -992,11 +992,11 @@ void jagged_dense_dense_elementwise_jagged_output_opt_(
992992
#endif
993993
int used_shared_bytes = used_shared_kb << 10;
994994
#ifndef __HIP_PLATFORM_HCC__
995-
cudaFuncSetAttribute(
995+
C10_CUDA_CHECK(cudaFuncSetAttribute(
996996
jagged_dense_dense_elementwise_jagged_output_opt_search_kernel_<
997997
index_t>,
998998
cudaFuncAttributeMaxDynamicSharedMemorySize,
999-
used_shared_bytes); // V100: 64 KB; A100: 96 KB.
999+
used_shared_bytes)); // V100: 64 KB; A100: 96 KB.
10001000
#endif
10011001
C10_CUDA_KERNEL_LAUNCH_CHECK();
10021002
TORCH_CHECK(dynamic_smem_size <= used_shared_bytes);

fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -274,10 +274,11 @@ void init_p2p_access() {
274274
for (const auto j : c10::irange(at::cuda::getNumGPUs())) {
275275
if (i != j) {
276276
at::cuda::CUDAGuard g(i);
277-
const auto err = cudaDeviceEnablePeerAccess(j, 0);
277+
const auto err =
278+
C10_CUDA_ERROR_HANDLED(cudaDeviceEnablePeerAccess(j, 0));
278279
if (err == cudaErrorPeerAccessAlreadyEnabled) {
279280
// ignore and clear the error if access was already enabled
280-
cudaGetLastError();
281+
C10_CUDA_CLEAR_ERROR();
281282
} else {
282283
AT_CUDA_CHECK(err);
283284
}

fbgemm_gpu/src/topology_utils.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
#include <ATen/cuda/CUDAContext.h>
99
#include <c10/core/Device.h>
10+
#include <c10/cuda/CUDAException.h>
1011
#include <algorithm>
1112

1213
#include "fbgemm_gpu/topology_utils.h"
@@ -131,14 +132,15 @@ AdjacencyMatrix<Links> get_nvlink_matrix() {
131132
&pci_info.busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE],
132133
pci_bus_id.data());
133134
int32_t node = 0;
134-
auto err = cudaDeviceGetByPCIBusId(&node, pci_bus_id.data());
135+
auto err = C10_CUDA_ERROR_HANDLED(
136+
cudaDeviceGetByPCIBusId(&node, pci_bus_id.data()));
135137
if (err == cudaSuccess) {
136138
pci_bus_ids.insert({pci_bus_id, node});
137139
cuda_device_to_nvml_device.insert({node, i});
138140
} else {
139141
// flush the last error - this can occur when e.g. we set
140142
// CUDA_VISIBLE_DEVICES to a subset of the available GPUs in the system.
141-
cudaGetLastError();
143+
C10_CUDA_CLEAR_ERROR();
142144
}
143145
}
144146

0 commit comments

Comments (0)