Skip to content

Commit 97c191f

Browse files
q10 authored and facebook-github-bot committed
Migrate jagged tensor kernels to FBGEMM_LAUNCH_KERNEL, pt 2 (pytorch#4350)
Summary: Pull Request resolved: pytorch#4350 X-link: facebookresearch/FBGEMM#1417 - Migrate jagged tensor kernels to `FBGEMM_LAUNCH_KERNEL`, pt 2 Reviewed By: r-barnes Differential Revision: D74974179 fbshipit-source-id: bc9595118bf8de92cadc79e52b83439ea855516b
1 parent 5b855c1 commit 97c191f

File tree

5 files changed

+52
-75
lines changed

5 files changed

+52
-75
lines changed

fbgemm_gpu/src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_forward.cu

Lines changed: 10 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -80,20 +80,16 @@ Tensor batched_dense_vec_jagged_2d_mul_forward(
8080
a_values.scalar_type(),
8181
"dense_vec_jagged_2d_bmm_kernel_2",
8282
[&] {
83-
84-
#ifdef FBGEMM_GPU_MEMCHECK
85-
const auto func_name1 = "dense_vec_jagged_2d_bmm";
86-
#endif
87-
dense_vec_jagged_2d_bmm<index_t, scalar_t>
88-
<<<div_round_up(B * H, block_dim_y),
89-
dim3(block_dim_x, block_dim_y),
90-
0,
91-
at::cuda::getCurrentCUDAStream()>>>(
92-
MAKE_PTA_WITH_NAME(func_name1, v, scalar_t, 2, 32),
93-
MAKE_PTA_WITH_NAME(func_name1, a_values, scalar_t, 2, 32),
94-
MAKE_PTA_WITH_NAME(func_name1, a_offsets, index_t, 1, 32),
95-
MAKE_PTA_WITH_NAME(func_name1, output, scalar_t, 2, 32));
96-
C10_CUDA_KERNEL_LAUNCH_CHECK();
83+
FBGEMM_LAUNCH_KERNEL(
84+
(dense_vec_jagged_2d_bmm<index_t, scalar_t>),
85+
div_round_up(B * H, block_dim_y),
86+
dim3(block_dim_x, block_dim_y),
87+
0,
88+
at::cuda::getCurrentCUDAStream(),
89+
PTA_B(v, scalar_t, 2, 32),
90+
PTA_B(a_values, scalar_t, 2, 32),
91+
PTA_B(a_offsets, index_t, 1, 32),
92+
PTA_B(output, scalar_t, 2, 32));
9793
});
9894
});
9995
}

fbgemm_gpu/src/jagged_tensor_ops/jagged_index_add_2d_forward.cu

Lines changed: 8 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -100,23 +100,18 @@ Tensor jagged_index_add_2d_forward_cuda(
100100
indices.scalar_type(),
101101
"jagged_index_add_2d_kernel_wrapper_2",
102102
[&] {
103-
#ifdef FBGEMM_GPU_MEMCHECK
104-
const auto func_name = "jagged_index_add_2d_kernel";
105-
#endif
106-
jagged_index_add_2d_kernel<<<
103+
FBGEMM_LAUNCH_KERNEL(
104+
(jagged_index_add_2d_kernel<index_t, int64_t, scalar_t>),
107105
dim3(num_blocks),
108106
dim3(num_cols),
109107
0,
110-
at::cuda::getCurrentCUDAStream()>>>(
111-
MAKE_PTA_WITH_NAME(func_name, output, scalar_t, 2, 64),
112-
MAKE_PTA_WITH_NAME(func_name, values, scalar_t, 2, 64),
113-
MAKE_PTA_WITH_NAME(
114-
func_name, (*input_offsets_contig), int64_t, 1, 32),
115-
MAKE_PTA_WITH_NAME(func_name, indices, index_t, 1, 32),
116-
MAKE_PTA_WITH_NAME(
117-
func_name, output_offsets, int64_t, 1, 32),
108+
at::cuda::getCurrentCUDAStream(),
109+
PTA_B(output, scalar_t, 2, 64),
110+
PTA_B(values, scalar_t, 2, 64),
111+
PTA_B((*input_offsets_contig), int64_t, 1, 32),
112+
PTA_B(indices, index_t, 1, 32),
113+
PTA_B(output_offsets, int64_t, 1, 32),
118114
num_dense_input_rows);
119-
C10_CUDA_KERNEL_LAUNCH_CHECK();
120115
});
121116
});
122117
}

fbgemm_gpu/src/jagged_tensor_ops/jagged_index_select_2d_forward.cu

Lines changed: 8 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -96,23 +96,18 @@ Tensor jagged_index_select_2d_forward_cuda(
9696
indices.scalar_type(),
9797
"jagged_index_select_2d_kernel_wrapper_2",
9898
[&] {
99-
#ifdef FBGEMM_GPU_MEMCHECK
100-
const auto func_name = "jagged_index_select_2d_kernel";
101-
#endif
102-
jagged_index_select_2d_kernel<<<
99+
FBGEMM_LAUNCH_KERNEL(
100+
(jagged_index_select_2d_kernel<index_t, int64_t, scalar_t>),
103101
dim3(num_blocks),
104102
dim3(num_cols),
105103
0,
106-
at::cuda::getCurrentCUDAStream()>>>(
107-
MAKE_PTA_WITH_NAME(func_name, output, scalar_t, 2, 64),
108-
MAKE_PTA_WITH_NAME(func_name, values, scalar_t, 2, 64),
109-
MAKE_PTA_WITH_NAME(
110-
func_name, input_offsets, int64_t, 1, 32),
111-
MAKE_PTA_WITH_NAME(func_name, indices, index_t, 1, 32),
112-
MAKE_PTA_WITH_NAME(
113-
func_name, (*output_offsets_contig), int64_t, 1, 32),
104+
at::cuda::getCurrentCUDAStream(),
105+
PTA_B(output, scalar_t, 2, 64),
106+
PTA_B(values, scalar_t, 2, 64),
107+
PTA_B(input_offsets, int64_t, 1, 32),
108+
PTA_B(indices, index_t, 1, 32),
109+
PTA_B(*output_offsets_contig, int64_t, 1, 32),
114110
num_dense_output_rows);
115-
C10_CUDA_KERNEL_LAUNCH_CHECK();
116111
});
117112
});
118113
}

fbgemm_gpu/src/jagged_tensor_ops/jagged_softmax_backward.cu

Lines changed: 13 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -112,27 +112,20 @@ Tensor jagged_softmax_backward_cuda(
112112
grad_output.scalar_type(),
113113
"jagged_softmax_backward_kernel_2",
114114
[&] {
115-
116-
#ifdef FBGEMM_GPU_MEMCHECK
117-
const auto func_name1 = "jagged_softmax_backward_kernel";
118-
#endif
119-
120-
jagged_softmax_backward_kernel<
115+
FBGEMM_LAUNCH_KERNEL(
116+
(jagged_softmax_backward_kernel<
117+
THREADS_PER_BLOCK,
118+
index_t,
119+
scalar_t>),
120+
grid,
121121
THREADS_PER_BLOCK,
122-
index_t,
123-
scalar_t>
124-
<<<grid,
125-
THREADS_PER_BLOCK,
126-
0,
127-
at::cuda::getCurrentCUDAStream()>>>(
128-
MAKE_PTA_WITH_NAME(
129-
func_name1, grad_output, scalar_t, 2, 32),
130-
MAKE_PTA_WITH_NAME(func_name1, output, scalar_t, 2, 32),
131-
MAKE_PTA_WITH_NAME(func_name1, offsets, index_t, 1, 32),
132-
MAKE_PTA_WITH_NAME(
133-
func_name1, grad_input, scalar_t, 2, 32),
134-
(int)max_L);
135-
C10_CUDA_KERNEL_LAUNCH_CHECK();
122+
0,
123+
at::cuda::getCurrentCUDAStream(),
124+
PTA_B(grad_output, scalar_t, 2, 32),
125+
PTA_B(output, scalar_t, 2, 32),
126+
PTA_B(offsets, index_t, 1, 32),
127+
PTA_B(grad_input, scalar_t, 2, 32),
128+
(int)max_L);
136129
});
137130
});
138131
}

fbgemm_gpu/src/jagged_tensor_ops/jagged_softmax_forward.cu

Lines changed: 13 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -133,21 +133,19 @@ Tensor jagged_softmax_forward_cuda(
133133
offsets.scalar_type(), "jagged_softmax_kernel_1", [&] {
134134
FBGEMM_DISPATCH_FLOATING_TYPES(
135135
values.scalar_type(), "jagged_softmax_kernel_2", [&] {
136-
137-
#ifdef FBGEMM_GPU_MEMCHECK
138-
const auto func_name1 = "jagged_softmax_kernel";
139-
#endif
140-
141-
jagged_softmax_kernel<THREADS_PER_BLOCK, index_t, scalar_t>
142-
<<<grid,
143-
THREADS_PER_BLOCK,
144-
0,
145-
at::cuda::getCurrentCUDAStream()>>>(
146-
MAKE_PTA_WITH_NAME(func_name1, values, scalar_t, 2, 32),
147-
MAKE_PTA_WITH_NAME(func_name1, offsets, index_t, 1, 32),
148-
MAKE_PTA_WITH_NAME(func_name1, output, scalar_t, 2, 32),
149-
(int)max_L);
150-
C10_CUDA_KERNEL_LAUNCH_CHECK();
136+
FBGEMM_LAUNCH_KERNEL(
137+
(jagged_softmax_kernel<
138+
THREADS_PER_BLOCK,
139+
index_t,
140+
scalar_t>),
141+
grid,
142+
THREADS_PER_BLOCK,
143+
0,
144+
at::cuda::getCurrentCUDAStream(),
145+
PTA_B(values, scalar_t, 2, 32),
146+
PTA_B(offsets, index_t, 1, 32),
147+
PTA_B(output, scalar_t, 2, 32),
148+
(int)max_L);
151149
});
152150
});
153151
}

0 commit comments

Comments (0)