
Commit e9ce63d

q10 authored and facebook-github-bot committed
Add set_max_dynamic_smem (pytorch#4398)

Summary:
Pull Request resolved: pytorch#4398

X-link: facebookresearch/FBGEMM#1469

- Fold the duplicated code that sets `cudaFuncAttributeMaxDynamicSharedMemorySize` into a shared `set_max_dynamic_smem` helper

Reviewed By: jianyuh, ionuthristodorescu

Differential Revision: D76700646

fbshipit-source-id: 01c4b651735f3b1c5c5d24d0af9b13ccd4da7398
1 parent 06247d1 commit e9ce63d
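
For context, here is a minimal sketch (not part of the diff) of the call-site pattern this commit consolidates: the per-site `#ifndef USE_ROCM` / `cudaFuncSetAttribute` / `C10_CUDA_KERNEL_LAUNCH_CHECK()` boilerplate becomes a single call to the new helper. The kernel name `my_kernel`, the launcher, and the launch configuration are illustrative assumptions, not code from this repository.

#include <cuda_runtime.h>

#include "fbgemm_gpu/utils/cuda_utilities.cuh"

// Hypothetical kernel; any __global__ function that wants more than the
// default 48 KB of dynamic shared memory per block needs the opt-in below.
__global__ void my_kernel(float* out);

void launch_my_kernel(float* out, int32_t used_shared_bytes, cudaStream_t stream) {
  // One helper call replaces the per-call-site cudaFuncSetAttribute(
  //     kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, ...) block
  // and is a no-op under USE_ROCM.
  fbgemm_gpu::utils::cuda::set_max_dynamic_smem(my_kernel, used_shared_bytes);

  my_kernel<<<1, 256, used_shared_bytes, stream>>>(out);
  C10_CUDA_KERNEL_LAUNCH_CHECK();
}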

File tree

9 files changed, +109 -97 lines


fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu

Lines changed: 3 additions & 14 deletions
@@ -24,6 +24,7 @@
 #include "fbgemm_gpu/sparse_ops.h"
 #include "fbgemm_gpu/config/feature_gates.h"
 #include "fbgemm_gpu/split_embeddings_utils.cuh"
+#include "fbgemm_gpu/utils/cuda_utilities.cuh"
 #include "fbgemm_gpu/utils/kernel_launcher.cuh"
 #include "fbgemm_gpu/utils/ops_utils.h"
 #include "fbgemm_gpu/utils/tensor_accessor_builder.h"
@@ -483,20 +484,8 @@ int32_t compute_num_groups_and_dynamic_smem_bytes(
   }
   TORCH_CHECK_GE(*num_groups, 1);
 
-  // Check https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x
-  // "Compute capability 7.x devices allow a single thread block to
-  // address the full capacity of shared memory: 96 KB on Volta,
-  // 64 KB on Turing. Kernels relying on shared memory allocations
-  // over 48 KB per block are architecture-specific, as such they
-  // must use dynamic shared memory (rather than statically sized
-  // arrays) and require an explicit opt-in using cudaFuncSetAttribute()".
-#ifndef USE_ROCM
-  cudaFuncSetAttribute(
-      bwd_kernel_fn,
-      cudaFuncAttributeMaxDynamicSharedMemorySize,
-      used_shared_bytes); // V100: 64 KB; A100: 96 KB; H100: 144 KB
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
-#endif
+  utils::cuda::set_max_dynamic_smem(bwd_kernel_fn, used_shared_bytes);
+
   return smem_bytes;
 }

fbgemm_gpu/codegen/training/forward/embedding_forward_split_template.cu

Lines changed: 1 addition & 1 deletion
@@ -37,7 +37,7 @@
 ////////////////////////////////////////////////////////////////////////////////
 #include "fbgemm_gpu/utils/ops_utils.h"
 {%- endif %}
-#include "fbgemm_gpu/utils/device_properties.cuh"
+#include "fbgemm_gpu/utils/cuda_utilities.cuh"
 #include "fbgemm_gpu/utils/kernel_launcher.cuh"
 #include "fbgemm_gpu/embedding_forward_template_helpers.cuh"
 #include "fbgemm_gpu/split_embeddings_cache_cuda.cuh"
fbgemm_gpu/include/fbgemm_gpu/utils/cuda_utilities.cuh

Lines changed: 87 additions & 0 deletions
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAException.h>
+#include <c10/cuda/CUDAStream.h>
+#include <cuda.h>
+
+namespace fbgemm_gpu::utils::cuda {
+
+// Based on the empirical study, max grid size that is 64x larger than the
+// number of SMs gives good performance across the board
+constexpr int32_t MAX_THREAD_BLOCKS_FACTOR = 64;
+
+inline auto get_max_thread_blocks(const c10::cuda::CUDAStream& stream) {
+  const auto device = stream.device_index();
+  return MAX_THREAD_BLOCKS_FACTOR *
+      at::cuda::getDeviceProperties(device)->multiProcessorCount;
+}
+
+inline auto get_compute_versions() {
+  static const auto versions = [] {
+    int runtime_version = 0;
+    cudaRuntimeGetVersion(&runtime_version);
+
+    int driver_version = 0;
+    cudaDriverGetVersion(&driver_version);
+
+    return std::make_tuple(runtime_version, driver_version);
+  }();
+
+  return versions;
+}
+
+template <typename func_t>
+inline void set_max_dynamic_smem(
+    func_t kernel,
+    const int32_t smem_bytes,
+    const int32_t device = at::cuda::current_device()) {
+#ifndef USE_ROCM
+
+  // Check
+  // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x
+  // "Compute capability 7.x devices allow a single thread block to
+  // address the full capacity of shared memory: 96 KB on Volta,
+  // 64 KB on Turing. Kernels relying on shared memory allocations
+  // over 48 KB per block are architecture-specific, as such they
+  // must use dynamic shared memory (rather than statically sized
+  // arrays) and require an explicit opt-in using cudaFuncSetAttribute()".
+
+  TORCH_CHECK(smem_bytes > 0);
+
+  int max_smem_bytes = 0;
+  C10_CUDA_CHECK(cudaDeviceGetAttribute(
+      &max_smem_bytes,
+#ifndef __HIP_PLATFORM_AMD__
+      cudaDevAttrMaxSharedMemoryPerBlockOptin,
+#else
+      hipDeviceAttributeMaxSharedMemoryPerBlock,
+#endif
+      device));
+
+  TORCH_CHECK(
+      smem_bytes <= max_smem_bytes,
+      "Attempted to allocate ",
+      smem_bytes / 1024,
+      " KB of shared memory but only ",
+      max_smem_bytes / 1024,
+      " KB is available");
+
+  C10_CUDA_CHECK(cudaFuncSetAttribute(
+      reinterpret_cast<void*>(kernel),
+      cudaFuncAttributeMaxDynamicSharedMemorySize,
+      // V100: 64 KB; A100: 96 KB; H100: 144 KB
+      smem_bytes));
+
+#endif
+}
+
+} // namespace fbgemm_gpu::utils::cuda

fbgemm_gpu/include/fbgemm_gpu/utils/device_properties.cuh

Lines changed: 0 additions & 42 deletions
This file was deleted.

fbgemm_gpu/src/jagged_tensor_ops/common.cuh

Lines changed: 5 additions & 6 deletions
@@ -28,6 +28,7 @@
 #include "fbgemm_gpu/sparse_ops.h"
 #include "fbgemm_gpu/utils/binary_search_range.cuh"
 #include "fbgemm_gpu/utils/cuda_block_count.h"
+#include "fbgemm_gpu/utils/cuda_utilities.cuh"
 #include "fbgemm_gpu/utils/dispatch_macros.h"
 #include "fbgemm_gpu/utils/fixed_divisor.cuh"
 #include "fbgemm_gpu/utils/inclusive_sum_scan.cuh"
@@ -834,14 +835,12 @@ void jagged_dense_elementwise_jagged_output_opt_(
     int used_shared_kb = shared_kb;
 #endif
     int used_shared_bytes = used_shared_kb << 10;
-#ifndef USE_ROCM
-    C10_CUDA_CHECK(cudaFuncSetAttribute(
+
+    utils::cuda::set_max_dynamic_smem(
         jagged_dense_dense_elementwise_jagged_output_opt_search_kernel_<
             index_t>,
-        cudaFuncAttributeMaxDynamicSharedMemorySize,
-        used_shared_bytes)); // V100: 64 KB; A100: 96 KB; H100: 144 KB
-#endif
-    C10_CUDA_KERNEL_LAUNCH_CHECK();
+        used_shared_bytes);
+
     TORCH_CHECK(dynamic_smem_size <= used_shared_bytes);
   }
 

fbgemm_gpu/src/jagged_tensor_ops/jagged_dense_dense_elementwise_add_jagged_output_forward.cu

Lines changed: 7 additions & 9 deletions
@@ -124,18 +124,16 @@ void jagged_dense_dense_elementwise_jagged_output_opt_(
     int used_shared_kb = shared_kb;
 #endif
     int used_shared_bytes = used_shared_kb << 10;
-#ifndef USE_ROCM
-    C10_CUDA_CHECK(cudaFuncSetAttribute(
+    TORCH_CHECK_LE(dynamic_smem_size, used_shared_bytes);
+
+    utils::cuda::set_max_dynamic_smem(
         jagged_dense_dense_elementwise_jagged_output_opt_search_kernel_<
             index_t>,
-        cudaFuncAttributeMaxDynamicSharedMemorySize,
-        used_shared_bytes)); // V100: 64 KB; A100: 96 KB.
-#endif
-    C10_CUDA_KERNEL_LAUNCH_CHECK();
-    TORCH_CHECK_LE(dynamic_smem_size, used_shared_bytes);
+        used_shared_bytes);
   }
-  dim3 threads_bs = dim3(1024, 1, 1);
-  dim3 blocks_bs = dim3(div_round_up(nnz, threads_bs.x), 1, 1);
+
+  const auto threads_bs = dim3(1024, 1, 1);
+  const auto blocks_bs = dim3(div_round_up(nnz, threads_bs.x), 1, 1);
 
 #ifdef FBGEMM_GPU_MEMCHECK
   const auto func_name1 =

fbgemm_gpu/src/quantize_ops/quantize_mx.cu

Lines changed: 3 additions & 15 deletions
@@ -15,6 +15,7 @@
 #include <torch/types.h>
 
 #include "c10/core/ScalarType.h"
+#include "fbgemm_gpu/utils/cuda_utilities.cuh"
 #include "fbgemm_gpu/utils/ops_utils.h"
 #include "fbgemm_gpu/utils/tensor_utils.h"
 
@@ -81,21 +82,8 @@ int32_t compute_num_groups_and_dynamic_smem_bytes(
   }
   TORCH_CHECK_GE(*num_groups_per_block, 1);
 
-  // Check
-  // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x
-  // "Compute capability 7.x devices allow a single thread block to
-  // address the full capacity of shared memory: 96 KB on Volta,
-  // 64 KB on Turing. Kernels relying on shared memory allocations
-  // over 48 KB per block are architecture-specific, as such they
-  // must use dynamic shared memory (rather than statically sized
-  // arrays) and require an explicit opt-in using cudaFuncSetAttribute()".
-#ifndef USE_ROCM
-  cudaFuncSetAttribute(
-      kernel_func_name,
-      cudaFuncAttributeMaxDynamicSharedMemorySize,
-      used_shared_bytes); // V100: 64 KB; A100: 96 KB; H100: 144 KB
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
-#endif
+  utils::cuda::set_max_dynamic_smem(kernel_func_name, used_shared_bytes);
+
   return smem_bytes;
 }

fbgemm_gpu/src/sparse_ops/common.cuh

Lines changed: 1 addition & 0 deletions
@@ -9,6 +9,7 @@
 #include "fbgemm_gpu/sparse_ops.cuh"
 #include "fbgemm_gpu/sparse_ops.h"
 #include "fbgemm_gpu/utils/cuda_block_count.h"
+#include "fbgemm_gpu/utils/cuda_utilities.cuh"
 #include "fbgemm_gpu/utils/ops_utils.h"
 
 #include <ATen/ATen.h>

fbgemm_gpu/src/sparse_ops/sparse_block_bucketize_features.cu

Lines changed: 2 additions & 10 deletions
@@ -71,14 +71,6 @@ void adjust_block_bucketize_sparse_features_kernel_launch_configs_based_on_smem(
   grid_dims->x = cuda_calc_xblock_count(lengths_size, block_dims->y);
 }
 
-template <typename func_t>
-void increase_gpu_max_dynamic_shared_memory(func_t kernel, const int max_smem) {
-  TORCH_CHECK(max_smem > 0);
-  C10_CUDA_CHECK(cudaFuncSetAttribute(
-      (void*)kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, max_smem));
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
-}
-
 // Kernel for bucketize lengths, with the Block distribution (vs. cyclic,
 // block-cyclic distribution). Used for bucketize sparse feature, especially for
 // checkpointing with row-wise partition (sparse_feature is partitioned
@@ -562,7 +554,7 @@ __launch_bounds__(kMaxThreads) void _populate_bucketized_permute_cuda_kernel(
           index_t, \
           scalar_t>; \
       if (smem_size > smem_adjust_threshold) { \
-        increase_gpu_max_dynamic_shared_memory( \
+        utils::cuda::set_max_dynamic_smem( \
            block_bucketize_kernel, max_smem); \
       } \
       block_bucketize_kernel<<< \
@@ -625,7 +617,7 @@ __launch_bounds__(kMaxThreads) void _populate_bucketized_permute_cuda_kernel(
           index_t, \
           std::nullptr_t>; \
       if (smem_size > smem_adjust_threshold) { \
-        increase_gpu_max_dynamic_shared_memory( \
+        utils::cuda::set_max_dynamic_smem( \
            block_bucketize_kernel, max_smem); \
       } \
       block_bucketize_kernel<<< \
