Skip to content

Commit 3571258

Browse files
ghq24int authored and facebook-github-bot committed
Invoke AMD specific kernel reorder_batched_ad_indices_kernel_vec (#4412)
Summary: Pull Request resolved: #4412 X-link: facebookresearch/FBGEMM#1483 For the benchmark in the codebase, the larger the product of length and num-ads is, the better the performance. Two optimizations: 1. Vector loading in a warp. 2. The product of batch-size and table-size determines the # of thread blocks (https://www.internalfb.com/code/fbsource/[cecfed562b79afad0eb9c44259141f50352da342]/fbcode/deeplearning/fbgemm/fbgemm_gpu/src/sparse_ops/sparse_reorder_batched_ad.cu?lines=361). In MRS models, we expect more thread blocks in our use cases. As such, we shrink the block size to achieve more thread blocks, thus improving compute utilization. Performance results and local test benchmarks: D77066925 Reviewed By: jwfromm, jianyuh, q10 Differential Revision: D77459476 fbshipit-source-id: 178a111cbcc67a59986410027bacbe75fc92ab26
1 parent b16ce18 commit 3571258

File tree

2 files changed

+30
-23
lines changed

2 files changed

+30
-23
lines changed

fbgemm_gpu/src/sparse_ops/common.cuh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
#include "fbgemm_gpu/split_embeddings_utils.cuh"
3333
#include "fbgemm_gpu/utils/binary_search_range.cuh"
3434
#include "fbgemm_gpu/utils/dispatch_macros.h"
35+
#include "fbgemm_gpu/utils/kernel_launcher.cuh"
3536
#include "fbgemm_gpu/utils/log2.h"
3637
#include "fbgemm_gpu/utils/tensor_accessor_builder.h"
3738

fbgemm_gpu/src/sparse_ops/sparse_reorder_batched_ad.cu

Lines changed: 29 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -300,7 +300,7 @@ __launch_bounds__(fbgemm_gpu::kMaxThreads) void reorder_batched_ad_indices_kerne
300300
// Idea: we want to copy the entire segment of size sum_a(length_{b, t, a})
301301
// from starting point (given by cat_ad_offsets[b, t])
302302
// to end point (given by reordered_cat_ad_indices[t][b])
303-
if (num_elements <= 64) {
303+
if (num_elements <= 64 || !(sizeof(Dtype) == 4 || sizeof(Dtype) == 8)) {
304304
for (auto i = threadIdx.x; i < input_segment_end - input_segment_start;
305305
i += blockDim.x) {
306306
// coalesced global memory access, can be optimzed through ILP with the
@@ -450,11 +450,6 @@ DLL_PUBLIC Tensor reorder_batched_ad_indices_gpu(
450450
return reordered_cat_ad_indices;
451451
}
452452
}
453-
constexpr auto NUM_WARPS = 32;
454-
auto maxWarpSize = kMaxThreads / NUM_WARPS;
455-
const dim3 threads(
456-
NUM_WARPS, maxWarpSize < kWarpSize ? maxWarpSize : kWarpSize); // 32 x 32
457-
const dim3 blocks(cuda_calc_xblock_count(B * T, NUM_WARPS));
458453
FBGEMM_DISPATCH_ALL_TYPES(
459454
cat_ad_indices.scalar_type(),
460455
"reorder_batched_ad_indices_gpu_kernel_1",
@@ -463,24 +458,35 @@ DLL_PUBLIC Tensor reorder_batched_ad_indices_gpu(
463458
cat_ad_offsets.scalar_type(),
464459
"reorder_batched_ad_indices_gpu_kernel_2",
465460
[&] {
466-
#ifdef FBGEMM_GPU_MEMCHECK
467-
const auto func_name = "reorder_batched_ad_indices_kernel";
461+
#if defined __HIP_PLATFORM_AMD__
462+
constexpr auto NUM_WARPS = 4;
463+
const dim3 threads(32, NUM_WARPS); // 32 x 4
464+
const dim3 blocks(cuda_calc_xblock_count(B * T, NUM_WARPS));
465+
constexpr auto reorder_batched_ad_indices_kernel_name =
466+
reorder_batched_ad_indices_kernel_vec<scalar_t, index_t>;
467+
#else
468+
constexpr auto NUM_WARPS = 32;
469+
auto maxWarpSize = kMaxThreads / NUM_WARPS;
470+
const dim3 threads(
471+
NUM_WARPS,
472+
maxWarpSize < kWarpSize ? maxWarpSize : kWarpSize); // 32 x 32
473+
const dim3 blocks(cuda_calc_xblock_count(B * T, NUM_WARPS));
474+
constexpr auto reorder_batched_ad_indices_kernel_name =
475+
reorder_batched_ad_indices_kernel<scalar_t, index_t>;
468476
#endif
469-
reorder_batched_ad_indices_kernel<scalar_t, index_t>
470-
<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
471-
MAKE_PTA_WITH_NAME(
472-
func_name, cat_ad_offsets, index_t, 1, 32),
473-
MAKE_PTA_WITH_NAME(
474-
func_name, cat_ad_indices, scalar_t, 1, 32),
475-
MAKE_PTA_WITH_NAME(
476-
func_name, reordered_cat_ad_offsets, index_t, 1, 32),
477-
MAKE_PTA_WITH_NAME(
478-
func_name, reordered_cat_ad_indices, scalar_t, 1, 32),
479-
MAKE_PTA_WITH_NAME(
480-
func_name, batch_offsets, int32_t, 1, 32),
481-
T,
482-
broadcast_indices);
483-
C10_CUDA_KERNEL_LAUNCH_CHECK();
477+
FBGEMM_LAUNCH_KERNEL(
478+
(reorder_batched_ad_indices_kernel_name),
479+
blocks,
480+
threads,
481+
0,
482+
at::cuda::getCurrentCUDAStream(),
483+
PTA_B(cat_ad_offsets, index_t, 1, 32),
484+
PTA_B(cat_ad_indices, scalar_t, 1, 32),
485+
PTA_B(reordered_cat_ad_offsets, index_t, 1, 32),
486+
PTA_B(reordered_cat_ad_indices, scalar_t, 1, 32),
487+
PTA_B(batch_offsets, int32_t, 1, 32),
488+
T,
489+
broadcast_indices);
484490
});
485491
});
486492

0 commit comments

Comments
 (0)