is_blackwell_deep_gemm_used must _lazy_init first

smarterclayton · smarterclayton · commit e59b90e43de3 · 2025-07-24T12:22:34.000-04:00
DeepGEMM on Blackwell was being ignored because _lazy_init had
not been invoked yet. Update the function to call _lazy_init
itself, regardless of the caller.

As a consequence of this change, CUDA init was failing, likely
due to the fork/spawn early during initialization. Since
this is generic code, switch from invoking
cuda_get_device_properties to a platform check.

Signed-off-by: Clayton Coleman &lt;smarterclayton@gmail.com&gt;
diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py
@@ -13,20 +13,23 @@
 import torch
 
 import vllm.envs as envs
-from vllm.utils import cuda_get_device_properties, has_deep_gemm
+from vllm.platforms import current_platform
+from vllm.utils import has_deep_gemm
 
 
 @functools.cache
 def is_blackwell_deep_gemm_used() -> bool:
     """Return ``True`` if vLLM is configured to use DeepGEMM on a
     Blackwell-class GPU.
     """
+    if not (envs.VLLM_USE_DEEP_GEMM and has_deep_gemm()):
+        return False
 
-    if not (envs.VLLM_USE_DEEP_GEMM and has_deep_gemm()
-            and _per_block_cast_impl is not None):
+    _lazy_init()
+    if _per_block_cast_impl is None:
         return False
 
-    return cuda_get_device_properties(0, ("major", ))[0] == 10
+    return current_platform.is_cuda() and current_platform.is_device_capability(100)
 
 
 def _missing(*_: Any, **__: Any) -> NoReturn:
@@ -64,7 +67,7 @@ def _lazy_init() -> None:
 
     if not has_deep_gemm():
         return
-
+    
     _dg = importlib.import_module("deep_gemm")
 
     _fp8_gemm_nt_impl = _resolve_symbol(_dg, "fp8_gemm_nt",