[BugFix] Fix: ImportError when building on hopper systems (vllm-project#20513)

LucasWilkinson · web-flow · commit 40b86aa05e44 · 2025-07-06T12:17:30.000+08:00
Signed-off-by: Lucas Wilkinson &lt;lwilkins@redhat.com&gt;
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
@@ -16,7 +16,7 @@
 /vllm/lora @jeejeelee
 /vllm/reasoning @aarnphm
 /vllm/entrypoints @aarnphm
-CMakeLists.txt @tlrmchlsmth
+CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 
 # Any change to the VllmConfig changes can have a large user-facing impact,
 # so spam a lot of people
diff --git a/csrc/ops.h b/csrc/ops.h
@@ -239,11 +239,6 @@ void cutlass_moe_mm(
     torch::Tensor const& b_strides, torch::Tensor const& c_strides,
     bool per_act_token, bool per_out_ch);
 
-void cutlass_blockwise_scaled_grouped_mm(
-    torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
-    const torch::Tensor& scales_a, const torch::Tensor& scales_b,
-    const torch::Tensor& problem_sizes, const torch::Tensor& expert_offsets);
-
 void cutlass_fp4_group_mm(
     torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
     const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
diff --git a/csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu b/csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu
@@ -1,3 +1,5 @@
+#include "core/registration.h"
+
 #include <torch/all.h>
 #include <cutlass/arch/arch.h>
 
@@ -364,4 +366,9 @@ void cutlass_blockwise_scaled_grouped_mm(
     TORCH_CHECK(false, "Unsupported output tensor type");
   }
 #endif
-}
+}
+
+TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
+  m.impl("cutlass_blockwise_scaled_grouped_mm",
+         &cutlass_blockwise_scaled_grouped_mm);
+}
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
@@ -399,8 +399,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "Tensor scales_a, Tensor scales_b, "
       "Tensor problem_sizes, Tensor expert_offsets) -> ()",
       {stride_tag});
-  ops.impl("cutlass_blockwise_scaled_grouped_mm", torch::kCUDA,
-           &cutlass_blockwise_scaled_grouped_mm);
+  // conditionally compiled so impl registration is in source file
 
   // cutlass nvfp4 block scaled group GEMM
   ops.def(