Skip to content

Commit c842d50

Browse files
iseeyuan and Martin Yuan authored
Add symmetric quantization with no clipping error in the tensor subclass based API (#845)
Co-authored-by: Martin Yuan <myuan@meta.com>
1 parent 9d169a8 commit c842d50

File tree

2 files changed

+10
-7
lines changed

2 files changed

+10
-7
lines changed

test/quantization/test_quant_api.py

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -56,6 +56,7 @@
5656
import tempfile
5757
import gc
5858
from torch.testing._internal.common_utils import TestCase
59+
from torch.testing._internal import common_utils
5960

6061

6162
def dynamic_quant(model, example_inputs):
@@ -500,12 +501,13 @@ def test_eval_wrapper_llama3(self):
500501

501502
# TODO: move to a separate test file
502503
@unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_4, "Test only enabled for 2.4+")
503-
def test_quantized_tensor_subclass_8da4w(self):
504+
@common_utils.parametrize("mapping_type", [MappingType.SYMMETRIC, MappingType.SYMMETRIC_NO_CLIPPING_ERR])
505+
def test_quantized_tensor_subclass_8da4w(self, mapping_type):
504506
group_size = 32
505507
m = ToyLinearModel().eval()
506508
m_copy = copy.deepcopy(m)
507509
example_inputs = m.example_inputs()
508-
quantize_(m, int8_dynamic_activation_int4_weight(group_size=group_size))
510+
quantize_(m, int8_dynamic_activation_int4_weight(group_size=group_size, mapping_type=mapping_type))
509511

510512
assert isinstance(m.linear1.weight, LinearActivationQuantizedTensor)
511513
assert isinstance(m.linear2.weight, LinearActivationQuantizedTensor)
@@ -516,7 +518,7 @@ def test_quantized_tensor_subclass_8da4w(self):
516518
from torchao.quantization.quant_api import Int8DynActInt4WeightQuantizer
517519
from torchao.quantization.GPTQ import Int8DynActInt4WeightLinear
518520

519-
quantizer = Int8DynActInt4WeightQuantizer(groupsize=group_size)
521+
quantizer = Int8DynActInt4WeightQuantizer(groupsize=group_size, mapping_type=mapping_type)
520522
m_copy = quantizer.quantize(m_copy)
521523
assert isinstance(m_copy.linear1, Int8DynActInt4WeightLinear)
522524
assert isinstance(m_copy.linear2, Int8DynActInt4WeightLinear)
@@ -704,6 +706,8 @@ def reset_memory():
704706
assert param.is_cuda
705707
self.assertLess(memory_streaming, memory_baseline)
706708

709+
common_utils.instantiate_parametrized_tests(TestQuantFlow)
710+
707711

708712
if __name__ == "__main__":
709713
unittest.main()

torchao/quantization/quant_api.py

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -474,14 +474,13 @@ def _int8_asymm_per_token_quant(x: torch.Tensor) -> torch.Tensor:
474474
target_dtype = torch.int8
475475
return to_affine_quantized_intx(x, mapping_type, _get_per_token_block_size(x), target_dtype)
476476

477-
def apply_int8_dynamic_activation_int4_weight_quant(weight, group_size=32):
477+
def apply_int8_dynamic_activation_int4_weight_quant(weight, group_size=32, mapping_type=MappingType.SYMMETRIC):
478478
"""This is defined here instead of local function to support serialization
479479
"""
480480
if weight.shape[-1] % group_size != 0:
481481
return weight
482482

483483
# weight settings
484-
mapping_type = MappingType.SYMMETRIC
485484
block_size = (1, group_size)
486485
target_dtype = torch.int8
487486
eps = torch.finfo(torch.float32).eps
@@ -495,7 +494,7 @@ def apply_int8_dynamic_activation_int4_weight_quant(weight, group_size=32):
495494
weight = to_linear_activation_quantized(weight, input_quant_func)
496495
return weight
497496

498-
def int8_dynamic_activation_int4_weight(group_size=32):
497+
def int8_dynamic_activation_int4_weight(group_size=32, mapping_type=MappingType.SYMMETRIC):
499498
"""Applies int8 dynamic per token asymmetric activation quantization and int4 per group weight symmetric quantization to linear
500499
This is used to produce a model for executorch backend, but currently executorch did not
501500
support lowering for the quantized model from this flow yet
@@ -504,7 +503,7 @@ def int8_dynamic_activation_int4_weight(group_size=32):
504503
`group_size`: parameter for quantization, controls the granularity of quantization, smaller
505504
size is more fine grained
506505
"""
507-
return _get_linear_subclass_inserter(apply_int8_dynamic_activation_int4_weight_quant, group_size=group_size)
506+
return _get_linear_subclass_inserter(apply_int8_dynamic_activation_int4_weight_quant, group_size=group_size, mapping_type=mapping_type)
508507

509508

510509
def int4_weight_only(group_size=128, layout_type=TensorCoreTiledLayoutType(inner_k_tiles=8), use_hqq=False):

0 commit comments

Comments
 (0)