Commit c64118c

update API and remove branching on quant_api.py transform functions
1 parent 6e6f6eb commit c64118c

File tree: 9 files changed, +192 −139 lines changed

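The change across these files is mechanical: callers used to pass either a bare base config (the "base" path) or a `MoEQuantConfig` wrapper (the `FakeExtraDimTensor` path), and the transform functions in quant_api.py branched on the config type. After this commit every caller wraps the base config in `MoEQuantConfig`, and the wrapper behavior is requested explicitly through the `use_fake_extra_dim_tensor` flag. A sketch of the before/after, using only names that appear in the diffs below:

```python
from torchao.quantization.prototype.moe_quant.utils import (
    MoEQuantConfig,
    UseFakeExtraDimTensor,
)
from torchao.quantization.quant_api import Int8WeightOnlyConfig

# Before: bare base configs and MoEQuantConfig coexisted, and the quant_api.py
# transform functions branched on which one they received.
# config = Int8WeightOnlyConfig()                  # old "base" path
# config = MoEQuantConfig(Int8WeightOnlyConfig())  # old FakeExtraDimTensor path

# After: always a MoEQuantConfig; the wrapper behavior is an explicit flag.
config_base = MoEQuantConfig(Int8WeightOnlyConfig())  # flag defaults to AS_FALLBACK
config_fake = MoEQuantConfig(
    Int8WeightOnlyConfig(),
    use_fake_extra_dim_tensor=UseFakeExtraDimTensor.TRUE,
)
```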

test/quantization/test_moe_quant.py

Lines changed: 34 additions & 13 deletions
@@ -12,6 +12,7 @@
 from torchao.quantization.prototype.moe_quant.utils import (
     FakeExtraDimTensor,
     MoEQuantConfig,
+    UseFakeExtraDimTensor,
     cond_ffn_filter,
 )
 from torchao.quantization.quant_api import (
@@ -25,7 +26,11 @@
     quantize_,
 )
 from torchao.quantization.utils import compute_error
-from torchao.utils import TORCH_VERSION_AT_LEAST_2_5, is_sm_at_least_90, TORCH_VERSION_AT_LEAST_2_6
+from torchao.utils import (
+    TORCH_VERSION_AT_LEAST_2_5,
+    TORCH_VERSION_AT_LEAST_2_6,
+    is_sm_at_least_90,
+)


 class TestMoEQuantCompile(unittest.TestCase):
@@ -61,7 +66,10 @@ def _test_impl_moe_quant(

         quantize_(model, config, cond_ffn_filter)

-        if isinstance(config, MoEQuantConfig):
+        if (
+            isinstance(config, MoEQuantConfig)
+            and config.use_fake_extra_dim_tensor == UseFakeExtraDimTensor.TRUE
+        ):
             self.assertIsInstance(model.experts.w1, FakeExtraDimTensor)
             if base_class is not None:
                 self.assertIsInstance(model.experts.w1.head_tensor, base_class)
@@ -104,7 +112,9 @@ def test_int4wo_fake_dim(self, name, num_tokens, fullgraph):
         if not TORCH_VERSION_AT_LEAST_2_5:
             self.skipTest("Test only enabled for 2.5+")

-        config = MoEQuantConfig(Int4WeightOnlyConfig())
+        config = MoEQuantConfig(
+            Int4WeightOnlyConfig(), use_fake_extra_dim_tensor=UseFakeExtraDimTensor.TRUE
+        )
         tensor_impl_class = TensorCoreTiledAQTTensorImpl

         self._test_impl_moe_quant(
@@ -128,7 +138,7 @@ def test_int4wo_base(self, name, num_tokens, fullgraph):
         if not TORCH_VERSION_AT_LEAST_2_5:
             self.skipTest("Test only enabled for 2.5+")

-        config = Int4WeightOnlyConfig()
+        config = MoEQuantConfig(Int4WeightOnlyConfig())
         tensor_impl_class = TensorCoreTiledAQTTensorImpl

         self._test_impl_moe_quant(
@@ -150,7 +160,9 @@ def test_int8wo_fake_dim(self, name, num_tokens, fullgraph):
         if not TORCH_VERSION_AT_LEAST_2_5:
             self.skipTest("Test only enabled for 2.5+")

-        config = MoEQuantConfig(Int8WeightOnlyConfig())
+        config = MoEQuantConfig(
+            Int8WeightOnlyConfig(), use_fake_extra_dim_tensor=UseFakeExtraDimTensor.TRUE
+        )
         tensor_impl_class = PlainAQTTensorImpl

         self._test_impl_moe_quant(
@@ -172,7 +184,7 @@ def test_int8wo_base(self, name, num_tokens, fullgraph):
         if not TORCH_VERSION_AT_LEAST_2_6:
             self.skipTest("Test only enabled for 2.6+")

-        config = Int8WeightOnlyConfig()
+        config = MoEQuantConfig(Int8WeightOnlyConfig())
         tensor_impl_class = PlainAQTTensorImpl

         self._test_impl_moe_quant(
@@ -192,7 +204,7 @@ def test_int8wo_base_cpu(self, name, num_tokens, fullgraph):
         if not TORCH_VERSION_AT_LEAST_2_6:
             self.skipTest("Test only enabled for 2.6+")

-        config = Int8WeightOnlyConfig()
+        config = MoEQuantConfig(Int8WeightOnlyConfig())
         tensor_impl_class = PlainAQTTensorImpl

         self._test_impl_moe_quant(
@@ -214,7 +226,10 @@ def test_int8dq_fake_dim(self, name, num_tokens, fullgraph):
         if not TORCH_VERSION_AT_LEAST_2_5:
             self.skipTest("Test only enabled for 2.5+")

-        config = MoEQuantConfig(Int8DynamicActivationInt8WeightConfig())
+        config = MoEQuantConfig(
+            Int8DynamicActivationInt8WeightConfig(),
+            use_fake_extra_dim_tensor=UseFakeExtraDimTensor.TRUE,
+        )
         base_class = LinearActivationQuantizedTensor

         self._test_impl_moe_quant(
@@ -236,7 +251,7 @@ def test_int8dq_base(self, name, num_tokens, fullgraph):
         if not TORCH_VERSION_AT_LEAST_2_5:
             self.skipTest("Test only enabled for 2.5+")

-        config = Int8DynamicActivationInt8WeightConfig()
+        config = MoEQuantConfig(Int8DynamicActivationInt8WeightConfig())
         base_class = LinearActivationQuantizedTensor

         self._test_impl_moe_quant(
@@ -259,7 +274,10 @@ def test_fp8wo_fake_dim(self, name, num_tokens, fullgraph):
         if not is_sm_at_least_90():
             self.skipTest("Requires CUDA capability >= 9.0")

-        config = MoEQuantConfig(Float8WeightOnlyConfig())
+        config = MoEQuantConfig(
+            Float8WeightOnlyConfig(),
+            use_fake_extra_dim_tensor=UseFakeExtraDimTensor.TRUE,
+        )
         tensor_impl_class = Float8AQTTensorImpl

         self._test_impl_moe_quant(
@@ -281,7 +299,7 @@ def test_fp8wo_base(self, name, num_tokens, fullgraph):
         if not is_sm_at_least_90():
             self.skipTest("Requires CUDA capability >= 9.0")

-        config = Float8WeightOnlyConfig()
+        config = MoEQuantConfig(Float8WeightOnlyConfig())
         tensor_impl_class = Float8AQTTensorImpl

         self._test_impl_moe_quant(
@@ -303,7 +321,10 @@ def test_fp8dq_fake_dim(self, name, num_tokens, fullgraph):
         if not is_sm_at_least_90():
             self.skipTest("Requires CUDA capability >= 9.0")

-        config = MoEQuantConfig(Float8DynamicActivationFloat8WeightConfig())
+        config = MoEQuantConfig(
+            Float8DynamicActivationFloat8WeightConfig(),
+            use_fake_extra_dim_tensor=UseFakeExtraDimTensor.TRUE,
+        )
         base_class = LinearActivationQuantizedTensor

         self._test_impl_moe_quant(
@@ -325,7 +346,7 @@ def test_fp8dq_base(self, name, num_tokens, fullgraph):
         if not is_sm_at_least_90():
             self.skipTest("Requires CUDA capability >= 9.0")

-        config = Float8DynamicActivationFloat8WeightConfig()
+        config = MoEQuantConfig(Float8DynamicActivationFloat8WeightConfig())
         base_class = LinearActivationQuantizedTensor

         self._test_impl_moe_quant(
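The tests select the wrapper path explicitly via `UseFakeExtraDimTensor.TRUE`. The enum itself is not shown in this commit; a plausible sketch, consistent with the values that do appear (TRUE in these tests, AS_FALLBACK as the documented default in the README change below), might look as follows. The FALSE member is an assumption:

```python
from enum import Enum


class UseFakeExtraDimTensor(Enum):
    """Hypothetical sketch; only TRUE and AS_FALLBACK appear in this commit."""

    FALSE = 0        # assumed: always use the native 3D tensor-subclass path
    TRUE = 1         # force the FakeExtraDimTensor wrapper
    AS_FALLBACK = 2  # documented default: try the base path, fall back if unsupported
```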

torchao/_models/mixtral-moe/generate.py

Lines changed: 17 additions & 12 deletions
@@ -239,6 +239,7 @@ def main(
     from torchao.quantization.prototype.moe_quant.utils import (
         MoEQuantConfig,
         cond_ffn_filter,
+        UseFakeExtraDimTensor,
     )
     from torchao.quantization.quant_api import (
         Float8DynamicActivationFloat8WeightConfig,
@@ -256,40 +257,44 @@ def main(
     torch._dynamo.config.capture_dynamic_output_shape_ops = True
     config = None
     if "int8wo-base" in moe_quant:
-        config = Int8WeightOnlyConfig()
+        config = MoEQuantConfig(Int8WeightOnlyConfig())

     elif "int8wo" in moe_quant:
-        config = MoEQuantConfig(Int8WeightOnlyConfig())
+        config = MoEQuantConfig(Int8WeightOnlyConfig(), use_fake_extra_dim_tensor=UseFakeExtraDimTensor.TRUE)

     elif "int8dq-base" in moe_quant:
-        config = Int8DynamicActivationInt8WeightConfig()
+        config = MoEQuantConfig(Int8DynamicActivationInt8WeightConfig())

     elif "int8dq" in moe_quant:
-        config = MoEQuantConfig(Int8DynamicActivationInt8WeightConfig())
+        config = MoEQuantConfig(Int8DynamicActivationInt8WeightConfig(), use_fake_extra_dim_tensor=UseFakeExtraDimTensor.TRUE)

     elif "int4wo-base" in moe_quant:
-        config = Int4WeightOnlyConfig()
+        config = MoEQuantConfig(Int4WeightOnlyConfig())

     elif "int4wo" in moe_quant:
-        config = MoEQuantConfig(Int4WeightOnlyConfig())
+        config = MoEQuantConfig(Int4WeightOnlyConfig(), use_fake_extra_dim_tensor=UseFakeExtraDimTensor.TRUE)

     elif "fp8wo-base" in moe_quant:
-        config = Float8WeightOnlyConfig()
+        config = MoEQuantConfig(Float8WeightOnlyConfig())

     elif "fp8wo" in moe_quant:
-        config = MoEQuantConfig(Float8WeightOnlyConfig())
+        config = MoEQuantConfig(Float8WeightOnlyConfig(), use_fake_extra_dim_tensor=UseFakeExtraDimTensor.TRUE)

     elif "fp8dq-base" in moe_quant:
-        config = Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
+        config = MoEQuantConfig(Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()))

     elif "fp8dq" in moe_quant:
         config = MoEQuantConfig(
-            Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
+            Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()),
+            use_fake_extra_dim_tensor=UseFakeExtraDimTensor.TRUE,
         )

     elif "intxdq" in moe_quant:
-        config = Int8DynamicActivationIntxWeightConfig(
-            layout=PackedLinearInt8DynamicActivationIntxWeightLayout()
+        config = MoEQuantConfig(
+            Int8DynamicActivationIntxWeightConfig(
+                layout=PackedLinearInt8DynamicActivationIntxWeightLayout(),
+            ),
+            use_fake_extra_dim_tensor=UseFakeExtraDimTensor.TRUE,
         )
     else:
         assert config is not None, (
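Because the chain above dispatches on substring matches, each "-base" key must be tested before its fake-dim counterpart. For illustration only (not part of the commit), the same dispatch could be written table-driven, assuming the imports already present in generate.py:

```python
# Sketch: table-driven equivalent of the if/elif chain above. Dict insertion
# order is preserved (Python 3.7+), so the "-base" keys are checked first.
def make_moe_config(moe_quant: str) -> MoEQuantConfig:
    fake = {"use_fake_extra_dim_tensor": UseFakeExtraDimTensor.TRUE}
    table = {
        "int8wo-base": (lambda: Int8WeightOnlyConfig(), {}),
        "int8wo": (lambda: Int8WeightOnlyConfig(), fake),
        "int8dq-base": (lambda: Int8DynamicActivationInt8WeightConfig(), {}),
        "int8dq": (lambda: Int8DynamicActivationInt8WeightConfig(), fake),
        "int4wo-base": (lambda: Int4WeightOnlyConfig(), {}),
        "int4wo": (lambda: Int4WeightOnlyConfig(), fake),
        "fp8wo-base": (lambda: Float8WeightOnlyConfig(), {}),
        "fp8wo": (lambda: Float8WeightOnlyConfig(), fake),
        "fp8dq-base": (
            lambda: Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()),
            {},
        ),
        "fp8dq": (
            lambda: Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()),
            fake,
        ),
        "intxdq": (
            lambda: Int8DynamicActivationIntxWeightConfig(
                layout=PackedLinearInt8DynamicActivationIntxWeightLayout()
            ),
            fake,
        ),
    }
    for key, (factory, kwargs) in table.items():
        if key in moe_quant:
            return MoEQuantConfig(factory(), **kwargs)
    raise ValueError(f"unsupported moe_quant option: {moe_quant}")
```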

torchao/dtypes/affine_quantized_tensor_ops.py

Lines changed: 0 additions & 1 deletion
@@ -504,7 +504,6 @@ def _(func, types, args, kwargs):
     assert len(indices) == 1, (
         f"op {func} currently only implemented for single dimensional indexing but got indices: {indices}"
     )
-
     new_tensor_impl = aten.index.Tensor(self.tensor_impl, indices)
     shape = tuple([indices[0].numel(), *self.shape[1:]])
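For context, this is the `aten.index` override that MoE expert selection relies on: a single index tensor on dim 0 produces shape `(indices[0].numel(), *shape[1:])`, exactly as a plain tensor would. A small runnable illustration of that shape rule, using an ordinary tensor as a stand-in for the quantized weight:

```python
import torch

# Stand-in for a quantized (num_experts, N, K) expert weight.
w = torch.randn(8, 256, 128)
idx = torch.tensor([0, 2])
# Matches the shape computed in the hunk above.
assert w[idx].shape == (2, 256, 128)
```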

torchao/dtypes/floatx/float8_layout.py

Lines changed: 29 additions & 6 deletions
@@ -55,6 +55,7 @@ class Float8Layout(Layout):

     mm_config: Optional[Float8MMConfig] = None

+_fallback_warning_shown = False

 @register_layout(Float8Layout)
 class Float8AQTTensorImpl(AQTTensorImpl):
@@ -100,12 +101,34 @@ def __init__(

     def _apply_fn_to_data(self, fn):
         """Applies a fn to all tensor components stored on this class"""
-        return self.__class__(
-            fn(self.float8_data),
-            fn(self.scale),
-            self.transposed,
-            self._layout,
-        )
+        global _fallback_warning_shown
+
+        try:
+            return self.__class__(
+                fn(self.float8_data),
+                fn(self.scale),
+                self.transposed,
+                self._layout,
+            )
+        except RuntimeError as e:
+            if '"index_cuda" not implemented for ' in str(e):
+                if not _fallback_warning_shown:
+                    import warnings
+
+                    warnings.warn(
+                        f"When trying to index Float8AQTTensorImpl, got known error {e}, will use slower fallback but "
+                        + "note: You can torch.compile the model to avoid this problem.",
+                        UserWarning,
+                    )
+                    _fallback_warning_shown = True
+
+                return self.__class__(  # do indexing in bfloat16 then convert back
+                    fn(self.float8_data.to(torch.bfloat16)).to(self.float8_data.dtype),
+                    fn(self.scale),
+                    self.transposed,
+                    self._layout,
+                )
+            else:
+                raise e

     def to(self, *args, **kwargs):
         kwargs = self._get_to_kwargs(*args, **kwargs)
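For context, the hunk above wraps the original body in a try/except because eager-mode indexing of float8 CUDA tensors can raise a '"index_cuda" not implemented' RuntimeError, so the data is round-tripped through bfloat16 just for the failing op. A minimal standalone sketch of that fallback (an illustration, not the commit's code):

```python
import torch


def index_float8_rows(t: torch.Tensor, idx: torch.Tensor) -> torch.Tensor:
    """Sketch of the bfloat16 indexing fallback used in the hunk above."""
    try:
        return t[idx]
    except RuntimeError as e:
        if '"index_cuda" not implemented for ' not in str(e):
            raise
        # float8 (e4m3/e5m2) values are exactly representable in bfloat16,
        # so the round trip is lossless; it is just slower than a native kernel.
        return t.to(torch.bfloat16)[idx].to(t.dtype)
```

The module-level `_fallback_warning_shown` flag makes the warning fire once per process rather than once per indexed tensor, and the warning's torch.compile suggestion reflects that the compiled path avoids the unsupported eager kernel.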

torchao/experimental/tests/test_int8_dynamic_activation_intx_weight.py

Lines changed: 2 additions & 1 deletion
@@ -637,6 +637,7 @@ def test_moe_quant_intx(self):
             FakeExtraDimTensor,
             MoEQuantConfig,
             cond_ffn_filter,
+            UseFakeExtraDimTensor,
         )
         from torchao.quantization.quant_api import (
             Int8DynamicActivationIntxWeightConfig,
@@ -656,7 +657,7 @@ def test_moe_quant_intx(self):
         base_config = Int8DynamicActivationIntxWeightConfig(
             layout=PackedLinearInt8DynamicActivationIntxWeightLayout()
         )
-        moe_config = MoEQuantConfig(base_config)
+        moe_config = MoEQuantConfig(base_config, use_fake_extra_dim_tensor=UseFakeExtraDimTensor.TRUE)

         quantize_(model, moe_config, cond_ffn_filter)
torchao/quantization/prototype/moe_quant/README.md

Lines changed: 12 additions & 6 deletions
@@ -10,10 +10,10 @@ The API for moe quantization is very similar to linear quantization, given a moe

 ```python

-from torchao.quantization.prototype.moe_quant.utils import cond_ffn_filter
+from torchao.quantization.prototype.moe_quant.utils import MoEQuantConfig, cond_ffn_filter
 from torchao.quantization.quant_api import quantize_, Int8WeightOnlyConfig

-quantize_(model, Int8WeightOnlyConfig(), filter_fn=cond_ffn_filter)
+quantize_(model, MoEQuantConfig(Int8WeightOnlyConfig()), filter_fn=cond_ffn_filter)
 model=torch.compile(model, mode="reduce-overhead")
 # you can also use fullgraph=True for single token inference
 ```
@@ -23,20 +23,26 @@ This api is the same as for normal linear quantization but with a specific filte

 ## Alternative Quantization API

-To make the above api work, each tensor subclass had to be edited to work as 3D tensors. However the only ops we actually need to support are a few indexing and slicing ops on the 0th dimension, the majority of the work was removing hard coded assumptions about the tensor dimensionality. This means its possible to instead create a new tensor subclass that pretends to be a 3D tensor by storing a series of 2D tensors and simulating the slicing and indexing ops until eventually just returning the singular desired 2D quantized tensor subclass. This can be achieved using the alternative api as follows:
+To make the above api work, each tensor subclass had to be edited to work as a 3D tensor. However, the only ops we actually need to support are a few indexing and slicing ops on the 0th dimension; the majority of the work was removing hard-coded assumptions about the tensor dimensionality. This means it's possible to instead create a new tensor subclass that pretends to be a 3D tensor by storing a series of 2D tensors and simulating the slicing and indexing ops until eventually just returning the single desired 2D quantized tensor subclass. This can be achieved using the alternative api by changing the use_fake_extra_dim_tensor flag of the MoEQuantConfig:

 ```python

-from torchao.quantization.prototype.moe_quant.utils import cond_ffn_filter, MoEQuantConfig
+from torchao.quantization.prototype.moe_quant.utils import cond_ffn_filter, MoEQuantConfig, UseFakeExtraDimTensor
 from torchao.quantization.quant_api import quantize_, Int8DynamicActivationIntxWeightConfig

-config = MoEQuantConfig(Int8DynamicActivationIntxWeightConfig())
+config = MoEQuantConfig(
+    Int8DynamicActivationIntxWeightConfig(),
+    # this is the only difference from the above api
+    use_fake_extra_dim_tensor=UseFakeExtraDimTensor.TRUE,
+)

 quantize_(model, config, filter_fn=cond_ffn_filter)
 model=torch.compile(model, mode="reduce-overhead")
 ```

-While this approach turns out to not be especially performant, it does allow for comparable memory characteristics, allowing models that wouldn't fit on a single node/gpu to actually run. It is flexible enough however to work with all of the existing linear quantization techniques that make use of quantized tensor subclasses without any changes being made to those classes. It is compilable though even single token inference doesn't work with fullgraph compilation.
+It should also be noted that the default value of use_fake_extra_dim_tensor is AS_FALLBACK, which means the base method is tried first and the more general but less performant fake_extra_dim_tensor method is used only when the base method isn't supported.
+
+While this approach turns out to not be especially performant, it does allow for slightly better memory characteristics, since all the tensors are held separately and aren't actually modified or indexed. It is flexible enough to work with all of the existing linear quantization techniques that make use of quantized tensor subclasses without any changes being made to those classes. It is compilable, though neither single-token nor multi-token inference works with fullgraph compilation.

4147
## Model API
4248

torchao/quantization/prototype/moe_quant/llama4_quant.py

Lines changed: 2 additions & 2 deletions
@@ -70,9 +70,9 @@ def convert_fn(module):
 model = model

 from torchao.quantization import Int4WeightOnlyConfig, quantize_
-from torchao.quantization.prototype.moe_quant.utils import cond_ffn_filter
+from torchao.quantization.prototype.moe_quant.utils import cond_ffn_filter, MoEQuantConfig

-quantize_(model, Int4WeightOnlyConfig(), cond_ffn_filter, device="cuda")
+quantize_(model, MoEQuantConfig(Int4WeightOnlyConfig()), cond_ffn_filter, device="cuda")

 model.cuda()