Remove debug_synchronous from CUB call sites in FBGEMM ops (#1973)

q10 · facebook-github-bot · commit 6f0abb0502d4 · 2023-08-31T14:49:43.000-07:00
Summary:
Pull Request resolved: #1973

- Remove debug_synchronous from CUB call sites in FBGEMM ops

Reviewed By: sryap

Differential Revision: D48722495

fbshipit-source-id: 47a92dc82e9fe271d719913f8a842f7fa2c8f36f
diff --git a/fbgemm_gpu/codegen/embedding_backward_split_template.cu b/fbgemm_gpu/codegen/embedding_backward_split_template.cu
@@ -524,8 +524,7 @@ Tensor split_embedding{{ ndesc }}_backward_codegen_{{ optimizer }}_{{ wdesc }}_e
             linear_indices.numel(),
             0,
             total_hash_size_bits,
-            at::cuda::getCurrentCUDAStream(),
-            false));
+            at::cuda::getCurrentCUDAStream()));
         auto temp_storage = at::empty(
             {static_cast<int64_t>(temp_storage_bytes)},
             indices.options().dtype(at::kByte));
@@ -539,8 +538,7 @@ Tensor split_embedding{{ ndesc }}_backward_codegen_{{ optimizer }}_{{ wdesc }}_e
             linear_indices.numel(),
             0,
             total_hash_size_bits,
-            at::cuda::getCurrentCUDAStream(),
-            false));
+            at::cuda::getCurrentCUDAStream()));
     }
     {%- endif %}
 
@@ -568,8 +566,7 @@ Tensor split_embedding{{ ndesc }}_backward_codegen_{{ optimizer }}_{{ wdesc }}_e
                 linear_indices.numel(),
                 0,
                 total_hash_size_bits,
-                at::cuda::getCurrentCUDAStream(),
-                false));
+                at::cuda::getCurrentCUDAStream()));
             auto temp_storage = at::empty(
                 {static_cast<int64_t>(temp_storage_bytes)},
                 indices.options().dtype(at::kByte));
@@ -583,8 +580,7 @@ Tensor split_embedding{{ ndesc }}_backward_codegen_{{ optimizer }}_{{ wdesc }}_e
                 linear_indices.numel(),
                 0,
                 total_hash_size_bits,
-                at::cuda::getCurrentCUDAStream(),
-                false));
+                at::cuda::getCurrentCUDAStream()));
             }
             {%- endif %}
 
diff --git a/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_utils.cuh b/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_utils.cuh
@@ -69,8 +69,7 @@ std::tuple<int32_t, uint32_t> adjust_info_B_num_bits(int32_t B, int32_t T);
       int num_items,                           \
       int begin_bit = 0,                       \
       int end_bit = sizeof(KeyT) * 8,          \
-      cudaStream_t stream = 0,                 \
-      bool debug_synchronous = false)
+      cudaStream_t stream = 0)
 
 DECL_RADIX_SORT_PAIRS_FN(int64_t, float);
 DECL_RADIX_SORT_PAIRS_FN(int64_t, double);
diff --git a/fbgemm_gpu/src/split_embeddings_utils.cu b/fbgemm_gpu/src/split_embeddings_utils.cu
@@ -21,6 +21,10 @@
 #include "fbgemm_gpu/cub_namespace_postfix.cuh"
 // clang-format on
 
+#ifdef __HIP_PLATFORM_HCC__
+#include <rocm_version.h>
+#endif
+
 inline at::Tensor asynchronous_complete_cumsum(at::Tensor t_in) {
   at::cuda::OptionalCUDAGuard device_guard;
   device_guard.set_index(t_in.get_device());
@@ -442,6 +446,32 @@ DLL_PUBLIC std::tuple<int32_t, uint32_t> adjust_info_B_num_bits(
   return {info_B_num_bits, info_B_mask};
 }
 
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 12000
+#define DEF_RADIX_SORT_PAIRS_FN(KeyT, ValueT)                        \
+  DLL_PUBLIC cudaError_t radix_sort_pairs(                           \
+      void* d_temp_storage,                                          \
+      size_t& temp_storage_bytes,                                    \
+      const KeyT* d_keys_in,                                         \
+      KeyT* d_keys_out,                                              \
+      const ValueT* d_values_in,                                     \
+      ValueT* d_values_out,                                          \
+      const int num_items,                                           \
+      const int begin_bit,                                           \
+      const int end_bit,                                             \
+      cudaStream_t stream) {                                         \
+    return FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceRadixSort::SortPairs( \
+        d_temp_storage,                                              \
+        temp_storage_bytes,                                          \
+        d_keys_in,                                                   \
+        d_keys_out,                                                  \
+        d_values_in,                                                 \
+        d_values_out,                                                \
+        num_items,                                                   \
+        begin_bit,                                                   \
+        end_bit,                                                     \
+        stream);                                                     \
+  }
+#else
 #define DEF_RADIX_SORT_PAIRS_FN(KeyT, ValueT)                        \
   DLL_PUBLIC cudaError_t radix_sort_pairs(                           \
       void* d_temp_storage,                                          \
@@ -453,8 +483,7 @@ DLL_PUBLIC std::tuple<int32_t, uint32_t> adjust_info_B_num_bits(
       const int num_items,                                           \
       const int begin_bit,                                           \
       const int end_bit,                                             \
-      cudaStream_t stream,                                           \
-      const bool debug_synchronous) {                                \
+      cudaStream_t stream) {                                         \
     return FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceRadixSort::SortPairs( \
         d_temp_storage,                                              \
         temp_storage_bytes,                                          \
@@ -466,8 +495,9 @@ DLL_PUBLIC std::tuple<int32_t, uint32_t> adjust_info_B_num_bits(
         begin_bit,                                                   \
         end_bit,                                                     \
         stream,                                                      \
-        debug_synchronous);                                          \
+        false);                                                      \
   }
+#endif
 
 DEF_RADIX_SORT_PAIRS_FN(int64_t, float);
 DEF_RADIX_SORT_PAIRS_FN(int64_t, double);