Commit c4a7ad4

Relax MOE constraints and add test for torch.mm computation (#2227)
* Relax some constraints to allow quantizing aten.mm

  Summary: Currently both float8 dynamic quant and int4 weight-only quant work
  only with F.linear, not aten.mm. This PR allows a fallback that dequantizes
  the tensors and runs the unquantized path, until real aten.mm support is in
  place.

  Test Plan:
  python test/dtypes/test_affine_quantized.py -k test_mm_int4wo
  python test/dtypes/test_affine_quantized_float.py -k test_mm_float8dq

* add skip if no cuda
* update tests
* update
1 parent 5153bd3 commit c4a7ad4
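
To make the change concrete, here is a minimal sketch (not part of the commit) of the behavior the commit message describes: calling aten.mm with a quantized weight, which now dequantizes and falls back to the plain matmul path instead of erroring. It assumes torchao with this commit, a CUDA GPU, and that the quantized layout supports transpose; import paths follow the test files below.

import torch
from torchao.quantization import Int4WeightOnlyConfig, quantize_

l = torch.nn.Linear(512, 1024, device="cuda", dtype=torch.bfloat16)
quantize_(l, Int4WeightOnlyConfig())  # l.weight is now a quantized tensor subclass

x = torch.randn(1, 512, device="cuda", dtype=torch.bfloat16)
# Dispatches to aten.mm; previously unsupported for quantized weights,
# now handled by dequantizing and running the unquantized matmul.
y = torch.mm(x, l.weight.t())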

File tree: 3 files changed (+45, −1 lines changed)

test/dtypes/test_affine_quantized.py

Lines changed: 18 additions & 0 deletions
@@ -421,6 +421,24 @@ def test_slice_and_copy_int4wo(self, device, dtype):
         # making sure param.data is updated
         assert param.data.dequantize()[0][0] != 0

+    @common_utils.parametrize("device", ["cuda"])
+    @common_utils.parametrize("dtype", [torch.bfloat16])
+    @skip_if_no_cuda()
+    @skip_if_rocm("ROCm enablement in progress")
+    def test_mm_int4wo(self, device, dtype):
+        weight = torch.randn(512, 1024).to(device).to(dtype)
+        weight = weight.t()
+
+        l = torch.nn.Linear(512, 1024).to(device).to(dtype)
+        l.weight = torch.nn.Parameter(weight)
+        quantize_(l, Int4WeightOnlyConfig())
+        # weight shape: 1024 x 512
+        weight = l.weight
+
+        input = torch.randn(1, 512, device=device, dtype=dtype)
+        # make sure it runs
+        torch.nn.functional.linear(input, weight)
+

 common_utils.instantiate_parametrized_tests(TestAffineQuantized)
 common_utils.instantiate_parametrized_tests(TestAffineQuantizedBasic)
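
The new test only checks that the call runs. A natural extension (hypothetical, not in the commit) is to sanity-check the fallback output against an explicit dequantize-then-matmul reference; the diff above already uses .dequantize() on param.data the same way, and the tolerance below is illustrative:

# Hypothetical extension of test_mm_int4wo, appended after the F.linear call.
ref = torch.nn.functional.linear(input, weight.data.dequantize())
out = torch.nn.functional.linear(input, weight)
# Both paths use the same quantized values, so they should agree closely.
assert torch.allclose(out, ref, atol=1e-1, rtol=1e-1)  # loose, illustrative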

test/dtypes/test_affine_quantized_float.py

Lines changed: 21 additions & 0 deletions
@@ -27,6 +27,7 @@

 from torchao.float8.float8_utils import compute_error
 from torchao.quantization import (
+    Float8DynamicActivationFloat8WeightConfig,
     float8_dynamic_activation_float8_weight,
     float8_weight_only,
     quantize_,
@@ -292,6 +293,26 @@ def test_fp8_weight_dimension_warning(self):
             f"Expected warning message containing: {expected}",
         )

+    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(
+        not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9"
+    )
+    def test_mm_float8dq(self):
+        device = "cuda"
+        dtype = torch.bfloat16
+        weight = torch.randn(512, 1024).to(device).to(dtype)
+        weight = weight.t()
+
+        l = torch.nn.Linear(512, 1024).to(device).to(dtype)
+        l.weight = torch.nn.Parameter(weight)
+        quantize_(l, Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()))
+        # weight shape: 1024 x 512
+        weight = l.weight
+
+        input = torch.randn(1, 512, device=device, dtype=dtype)
+        # make sure it runs
+        torch.nn.functional.linear(input, weight)
+

 common_utils.instantiate_parametrized_tests(TestAffineQuantizedFloat8Compile)
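
For reference, the float8 path under test can be exercised outside the test harness with roughly the sketch below. Import paths are assumed from torchao's public API, and, matching the is_sm_at_least_89 skip condition above, it needs a CUDA GPU with compute capability >= 8.9.

import torch
from torchao.quantization import (
    Float8DynamicActivationFloat8WeightConfig,
    PerRow,
    quantize_,
)

l = torch.nn.Linear(512, 1024, device="cuda", dtype=torch.bfloat16)
quantize_(l, Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()))

x = torch.randn(1, 512, device="cuda", dtype=torch.bfloat16)
y = torch.mm(x, l.weight.t())  # exercises the new aten.mm fallback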

torchao/prototype/moe_quant/utils.py

Lines changed: 6 additions & 1 deletion
@@ -1,3 +1,9 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD 3-Clause license found in the
+# LICENSE file in the root directory of this source tree.
+
 import torch
 from torch.utils._python_dispatch import (
     return_and_correct_aliasing,
@@ -282,7 +288,6 @@ def moe_quant_fn(module, config: MoEQuantConfig):

     warnings.simplefilter("ignore", lineno=84)
     warnings.simplefilter("ignore", lineno=105)
-    assert "ConditionalFeedForwardAOQuantizable" in str(type(module))

     for weight_attr in ["w1", "w2", "w3"]:
         param = getattr(module, weight_attr)
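
Removing the assert means moe_quant_fn no longer requires the module's class name to contain "ConditionalFeedForwardAOQuantizable": any module exposing the w1/w2/w3 weight parameters iterated over above can now be quantized. A hypothetical custom expert module that would now qualify (names and shapes are illustrative only):

import torch

class CustomExperts(torch.nn.Module):
    # moe_quant_fn only touches the w1/w2/w3 parameters it iterates over,
    # not a specific class name, so a module like this is now accepted.
    def __init__(self, num_experts=8, dim=512, hidden=1024):
        super().__init__()
        self.w1 = torch.nn.Parameter(torch.randn(num_experts, hidden, dim))
        self.w2 = torch.nn.Parameter(torch.randn(num_experts, dim, hidden))
        self.w3 = torch.nn.Parameter(torch.randn(num_experts, hidden, dim))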
