Add symmetric quantization with no clipping error in the tensor subclass based API (#845)

iseeyuan · Martin Yuan · web-flow · commit c842d50c0041 · 2024-09-09T16:15:10.000-07:00
Co-authored-by: Martin Yuan <myuan@meta.com>
diff --git a/test/quantization/test_quant_api.py b/test/quantization/test_quant_api.py
@@ -56,6 +56,7 @@
 import tempfile
 import gc
 from torch.testing._internal.common_utils import TestCase
+from torch.testing._internal import common_utils
 
 
 def dynamic_quant(model, example_inputs):
@@ -500,12 +501,13 @@ def test_eval_wrapper_llama3(self):
 
     # TODO: move to a separate test file
     @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_4, "Test only enabled for 2.4+")
-    def test_quantized_tensor_subclass_8da4w(self):
+    @common_utils.parametrize("mapping_type", [MappingType.SYMMETRIC, MappingType.SYMMETRIC_NO_CLIPPING_ERR])
+    def test_quantized_tensor_subclass_8da4w(self, mapping_type):
         group_size = 32
         m = ToyLinearModel().eval()
         m_copy = copy.deepcopy(m)
         example_inputs = m.example_inputs()
-        quantize_(m, int8_dynamic_activation_int4_weight(group_size=group_size))
+        quantize_(m, int8_dynamic_activation_int4_weight(group_size=group_size, mapping_type=mapping_type))
 
         assert isinstance(m.linear1.weight, LinearActivationQuantizedTensor)
         assert isinstance(m.linear2.weight, LinearActivationQuantizedTensor)
@@ -516,7 +518,7 @@ def test_quantized_tensor_subclass_8da4w(self):
         from torchao.quantization.quant_api import Int8DynActInt4WeightQuantizer
         from torchao.quantization.GPTQ import Int8DynActInt4WeightLinear
 
-        quantizer = Int8DynActInt4WeightQuantizer(groupsize=group_size)
+        quantizer = Int8DynActInt4WeightQuantizer(groupsize=group_size, mapping_type=mapping_type)
         m_copy = quantizer.quantize(m_copy)
         assert isinstance(m_copy.linear1, Int8DynActInt4WeightLinear)
         assert isinstance(m_copy.linear2, Int8DynActInt4WeightLinear)
@@ -704,6 +706,8 @@ def reset_memory():
             assert param.is_cuda
         self.assertLess(memory_streaming, memory_baseline)
 
+common_utils.instantiate_parametrized_tests(TestQuantFlow)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/torchao/quantization/quant_api.py b/torchao/quantization/quant_api.py
@@ -474,14 +474,13 @@ def _int8_asymm_per_token_quant(x: torch.Tensor) -> torch.Tensor:
     target_dtype = torch.int8
     return to_affine_quantized_intx(x, mapping_type, _get_per_token_block_size(x), target_dtype)
 
-def apply_int8_dynamic_activation_int4_weight_quant(weight, group_size=32):
+def apply_int8_dynamic_activation_int4_weight_quant(weight, group_size=32, mapping_type=MappingType.SYMMETRIC):
     """This is defined here instead of local function to support serialization
     """
     if weight.shape[-1] % group_size != 0:
         return weight
 
     # weight settings
-    mapping_type = MappingType.SYMMETRIC
     block_size = (1, group_size)
     target_dtype = torch.int8
     eps = torch.finfo(torch.float32).eps
@@ -495,7 +494,7 @@ def apply_int8_dynamic_activation_int4_weight_quant(weight, group_size=32):
     weight = to_linear_activation_quantized(weight, input_quant_func)
     return weight
 
-def int8_dynamic_activation_int4_weight(group_size=32):
+def int8_dynamic_activation_int4_weight(group_size=32, mapping_type=MappingType.SYMMETRIC):
     """Applies int8 dynamic per token asymmetric activation quantization and int4 per group weight symmetric quantization to linear
     This is used to produce a model for executorch backend, but currently executorch did not
     support lowering for the quantized model from this flow yet
@@ -504,7 +503,7 @@ def int8_dynamic_activation_int4_weight(group_size=32):
         `group_size`: parameter for quantization, controls the granularity of quantization, smaller
          size is more fine grained
     """
-    return _get_linear_subclass_inserter(apply_int8_dynamic_activation_int4_weight_quant, group_size=group_size)
+    return _get_linear_subclass_inserter(apply_int8_dynamic_activation_int4_weight_quant, group_size=group_size, mapping_type=mapping_type)
 
 
 def int4_weight_only(group_size=128, layout_type=TensorCoreTiledLayoutType(inner_k_tiles=8), use_hqq=False):