silu_mul_quant fix (#4395)

sunfish2010 · facebook-github-bot · commit 04326157a2ef · 2025-06-24T13:57:43.000-07:00
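Summary: Pull Request resolved: #4395. X-link: facebookresearch/FBGEMM#1466. Avoid division by zero in silu_mul_quant when T == 0: with T == 0, BLOCK_T = cdiv(T, NUM_SMS) is 0, so the following cdiv(T, BLOCK_T) divides by zero; the function now returns the empty output tensors before computing the launch grid, and T = 0 is added to the sampled sizes in the activation tests. Reviewed By: jianyuh. Differential Revision: D77236510. fbshipit-source-id: f8943125b358443bea9b05e136875f5c93822b26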
diff --git a/fbgemm_gpu/experimental/gen_ai/gen_ai/moe/activation.py b/fbgemm_gpu/experimental/gen_ai/gen_ai/moe/activation.py
@@ -104,9 +104,12 @@ def silu_mul_quant(
 
     out = torch.empty((T, D), device="cuda", dtype=pt_dtype)
     out_inv_scale = torch.empty((T,), device="cuda", dtype=torch.float32)
+    if T == 0:
+        return out, out_inv_scale
 
     NUM_SMS = torch.cuda.get_device_properties("cuda").multi_processor_count
     BLOCK_T = triton.cdiv(T, NUM_SMS)
+
     NUM_CTAS = triton.cdiv(T, BLOCK_T)
 
     grid = (NUM_CTAS,)
diff --git a/fbgemm_gpu/experimental/gen_ai/test/moe/activation_test.py b/fbgemm_gpu/experimental/gen_ai/test/moe/activation_test.py
@@ -36,7 +36,7 @@ class ActivationTests(unittest.TestCase):
     """Test activation kernels."""
 
     @given(
-        T=st.sampled_from([1, 128, 2048, 4096, 16384]),
+        T=st.sampled_from([0, 1, 128, 2048, 4096, 16384]),
         D=st.sampled_from([5120, 7168]),
         contiguous=st.sampled_from([True, False]),
         partial=st.sampled_from([True, False]),
@@ -94,7 +94,7 @@ def ref_fn() -> torch.Tensor:
         "Skip when H100 is not available",
     )
     @given(
-        T=st.sampled_from([1, 128, 2048, 4096, 16384]),
+        T=st.sampled_from([0, 1, 128, 2048, 4096, 16384]),
         D=st.sampled_from([5120, 7168]),
         scale_ub=st.sampled_from([None, 1200.00]),
         contiguous=st.sampled_from([True, False]),