
Commit 3f5705d

[NVFP4] Expand dynamic types, clean-up conditions (#325)
* add DynamicType
* update to use tensor_group
* more condition clean-up
* update global scale creation
* fix conditions, fix tests
* add validation
* update/fix conditions
* Update src/compressed_tensors/quantization/lifecycle/initialize.py
* Update src/compressed_tensors/quantization/quant_args.py
* use explicit condition

---------

Co-authored-by: Brian Dellabetta <brian-dellabetta@users.noreply.github.com>
1 parent 8367985 commit 3f5705d

File tree (7 files changed: +135 / −77 lines):

* src/compressed_tensors/quantization/lifecycle/forward.py
* src/compressed_tensors/quantization/lifecycle/initialize.py
* src/compressed_tensors/quantization/quant_args.py
* src/compressed_tensors/quantization/quant_config.py
* src/compressed_tensors/quantization/quant_scheme.py
* src/compressed_tensors/quantization/utils/helpers.py
* tests/test_quantization/lifecycle/test_initialize.py


src/compressed_tensors/quantization/lifecycle/forward.py

Lines changed: 5 additions & 4 deletions
@@ -18,6 +18,7 @@

 import torch
 from compressed_tensors.quantization.quant_args import (
+    DynamicType,
     QuantizationArgs,
     QuantizationStrategy,
     QuantizationType,
@@ -190,8 +191,8 @@ def _process_quantization(
     group_size = args.group_size

     if args.strategy in (QuantizationStrategy.GROUP, QuantizationStrategy.TENSOR_GROUP):
-        if args.strategy == QuantizationStrategy.TENSOR_GROUP:
-            # only valid for activation; remove dim 0
+        n_dims = x.shape
+        if len(n_dims) > 2:
             x = x.squeeze(0)

         output_dtype = dtype if dtype is not None else x.dtype
@@ -255,7 +256,7 @@ def _process_quantization(
         if not is_column_order:
             output = safe_permute(output, torch.argsort(perm), dim=1)

-        if args.strategy == QuantizationStrategy.TENSOR_GROUP:
+        if len(n_dims) > 2:
             output = output.unsqueeze(0)

     else:  # covers channel, token and tensor strategies
@@ -359,7 +360,7 @@ def forward_quantize(
     g_idx = getattr(module, "weight_g_idx", None)
     global_scale = getattr(module, f"{base_name}_global_scale", None)

-    if args.dynamic or args.strategy == QuantizationStrategy.TENSOR_GROUP:
+    if args.dynamic in (True, DynamicType.LOCAL):
         # dynamic quantization - determine the scale/zp on the fly
         scale, zero_point = compute_dynamic_scales_and_zp(
             value=value, args=args, module=module, global_scale=global_scale
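The forward-pass change above swaps the strategy check for a membership test on `args.dynamic`. A minimal standalone sketch (using a stand-in enum, not the library class) of why that single test covers both fully dynamic and locally dynamic configs; because `DynamicType` is a `str` enum, the plain string "local" stored via `use_enum_values=True` also matches:

```python
from enum import Enum


class DynamicType(str, Enum):  # stand-in mirroring the enum added in quant_args.py
    LOCAL = "local"


def computes_scales_on_the_fly(dynamic) -> bool:
    # mirrors the new check in forward_quantize: scales/zero-points are computed
    # at runtime when dynamic is True (fully dynamic) or "local" (locally dynamic)
    return dynamic in (True, DynamicType.LOCAL)


assert computes_scales_on_the_fly(True)
assert computes_scales_on_the_fly(DynamicType.LOCAL)
assert computes_scales_on_the_fly("local")    # str enum compares equal to its value
assert not computes_scales_on_the_fly(False)  # static quantization keeps stored scales
```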

src/compressed_tensors/quantization/lifecycle/initialize.py

Lines changed: 43 additions & 43 deletions
@@ -156,13 +156,33 @@ def _initialize_scale_zero_point(
     force_zero_point: bool = True,
     scale_dtype: Optional[torch.dtype] = None,
 ):
-    if quantization_args.dynamic:
+    if quantization_args.dynamic is True:
         return

     # initialize on execution device to avoid performing quantized ops on cpu
     device = get_execution_device(module)

-    # infer expected scale/zero point shape
+    # 1. Create global_scales for tensor_group
+    if quantization_args.strategy == QuantizationStrategy.TENSOR_GROUP:
+        # TODO: should move to llmcompressor
+        if base_name == "weight":
+            # When applying weight-only FP4 quantization, generate a global_scale
+            # This scale is applied during runtime to ensure that the generated
+            # local scale falls properly within the FP8 range (i.e max value is FP8_max)
+            # which is the expected dtype of NVFP4A16 scales
+            value = generate_global_scale(input_tensor=module.weight)
+            value = value.to(device)
+            init_global_scale = Parameter(value, requires_grad=False)
+        else:
+            init_global_scale = Parameter(
+                torch.empty(1, dtype=torch.float32, device=device),
+                requires_grad=False,
+            )
+        register_offload_parameter(
+            module, f"{base_name}_global_scale", init_global_scale
+        )
+
+    # 2. Infer expected scale/zero point shape
     if quantization_args.strategy == QuantizationStrategy.TOKEN:
         expected_shape = (1, 1)
     else:
@@ -172,55 +192,35 @@ def _initialize_scale_zero_point(
         if quantization_args.strategy == QuantizationStrategy.CHANNEL:
             # (output_channels, 1)
             expected_shape = (weight_shape[0], 1)
-        elif quantization_args.strategy == QuantizationStrategy.GROUP:
+        elif quantization_args.strategy in (
+            QuantizationStrategy.TENSOR_GROUP,
+            QuantizationStrategy.GROUP,
+        ):
             num_groups = math.ceil(weight_shape[1] / quantization_args.group_size)
             expected_shape = (weight_shape[0], max(num_groups, 1))

+    # 3. Identify quantization scale and zp dtype
     scale_dtype = scale_dtype if scale_dtype is not None else module.weight.dtype
-    # TODO: consider erroring out in the future as if the dtype if not one fo these,
-    # there is likely bug
-
-    if is_fp4(quantization_args=quantization_args) and base_name == "weight":
-        assert quantization_args.strategy == QuantizationStrategy.GROUP
-        scale_dtype = FP8_E4M3_DATA.dtype
-        # When applying weight-only FP4 quantization, generate a global_scale
-        # This scale is applied during runtime to ensure that the generated
-        # local scale falls properly within the FP8 range (i.e max value is FP8_max)
-        # which is the expected dtype of NVFP4A16 scales
-        value = generate_global_scale(input_tensor=module.weight)
-        value = value.to(device)
-        init_global_scale = Parameter(value, requires_grad=False)
-        register_offload_parameter(
-            module, f"{base_name}_global_scale", init_global_scale
-        )

-    # initializes empty scale, zero point, and g_idx parameters for the module
-    if is_fp4(quantization_args=quantization_args) and base_name == "input":
-        assert quantization_args.strategy == QuantizationStrategy.TENSOR_GROUP
-        scale_dtype = torch.float32
-        scale_name = f"{base_name}_global_scale"
+    if is_fp4(quantization_args=quantization_args):
+        scale_dtype = zp_dtype = FP8_E4M3_DATA.dtype
     else:
-        scale_name = f"{base_name}_scale"
-
-    if scale_dtype not in [
-        torch.float16,
-        torch.bfloat16,
-        torch.float32,
-    ] and not is_fp4(quantization_args=quantization_args):
-        scale_dtype = torch.float16
-
-    init_scale = Parameter(
-        torch.empty(expected_shape, dtype=scale_dtype, device=device),
-        requires_grad=False,
-    )
-    register_offload_parameter(module, scale_name, init_scale)
+        # TODO: consider erroring out in the future as if the dtype if not one of these,
+        # there is likely bug
+        if scale_dtype not in [torch.float16, torch.bfloat16, torch.float32]:
+            scale_dtype = torch.float16
+        zp_dtype = quantization_args.pytorch_dtype()
+
+    # 4. Initializes empty scale, zero point, and g_idx parameters for the module
+    # do not init scales for quantzation_args.dynamic == DynamicType.local
+    if not quantization_args.dynamic:
+        init_scale = Parameter(
+            torch.empty(expected_shape, dtype=scale_dtype, device=device),
+            requires_grad=False,
+        )
+        register_offload_parameter(module, f"{base_name}_scale", init_scale)

     if force_zero_point or not quantization_args.symmetric:
-        if is_fp4(quantization_args=quantization_args):
-            zp_dtype = FP8_E4M3_DATA.dtype
-        else:
-            zp_dtype = quantization_args.pytorch_dtype()
-
         init_zero_point = Parameter(
             torch.zeros(expected_shape, device=device, dtype=zp_dtype),
             requires_grad=False,
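Step 2 above now uses the same shape inference for GROUP and TENSOR_GROUP. A hypothetical helper (not part of compressed-tensors) mirroring just that shape logic:

```python
import math
from typing import Optional, Tuple


def expected_scale_shape(
    weight_shape: Tuple[int, int],
    strategy: str,
    group_size: Optional[int] = None,
) -> Tuple[int, int]:
    if strategy == "channel":
        return (weight_shape[0], 1)                   # one scale per output channel
    if strategy in ("group", "tensor_group"):
        num_groups = math.ceil(weight_shape[1] / group_size)
        return (weight_shape[0], max(num_groups, 1))  # one scale per (row, group)
    return (1, 1)                                     # token / tensor style fallback


# e.g. a 256x128 weight quantized with group_size=16 gets a (256, 8) scale tensor
assert expected_scale_shape((256, 128), "tensor_group", group_size=16) == (256, 8)
```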

src/compressed_tensors/quantization/quant_args.py

Lines changed: 44 additions & 9 deletions
@@ -32,6 +32,7 @@
     "QuantizationArgs",
     "round_to_quantized_type",
     "ActivationOrdering",
+    "DynamicType",
 ]


@@ -101,6 +102,21 @@ class QuantizationStrategy(str, Enum):
     TENSOR_GROUP = "tensor_group"


+class DynamicType(str, Enum):
+    """
+    Enum storing potential dynamic types.
+
+    1. If dynamic is True, all quantization parameters are generated on the fly.
+    2. If dynamic is False, all quantization parameters generated are static.
+    3. If "local" is provided, only local quantization parameters are dynamic.
+
+    Note: "local" is only currently supported for NVFP4.
+
+    """
+
+    LOCAL = "local"
+
+
 class ActivationOrdering(Aliasable, str, Enum):
     """
     Enum storing strategies for activation ordering
@@ -153,7 +169,7 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
     group_size: Optional[int] = None
     strategy: Optional[QuantizationStrategy] = None
     block_structure: Optional[str] = None
-    dynamic: bool = False
+    dynamic: Union[DynamicType, bool] = False
     actorder: Union[ActivationOrdering, bool, None] = None
     observer: Optional[str] = Field(
         default=None,
@@ -207,6 +223,12 @@ def validate_actorder(cls, value) -> Optional[ActivationOrdering]:

         return value

+    @field_validator("dynamic", mode="before")
+    def validate_dynamic(cls, value) -> Union[DynamicType, bool]:
+        if isinstance(value, str):
+            return DynamicType(value.lower())
+        return value
+
     @model_validator(mode="after")
     def validate_model_after(model: "QuantizationArgs") -> "QuantizationArgs":
         # extract user-passed values from dictionary
@@ -257,18 +279,31 @@ def validate_model_after(model: "QuantizationArgs") -> "QuantizationArgs":
             if strategy not in (
                 QuantizationStrategy.TOKEN,
                 QuantizationStrategy.TENSOR,
+                QuantizationStrategy.TENSOR_GROUP,
             ):
                 raise ValueError(
-                    f"One of {QuantizationStrategy.TOKEN} or "
-                    f"{QuantizationStrategy.TENSOR} must be used for dynamic ",
-                    "quantization",
+                    f"One of {(QuantizationStrategy.TOKEN, QuantizationStrategy.TENSOR, QuantizationStrategy.TENSOR_GROUP)} "
+                    "must be used for dynamic quantization",
                 )
+
+            if (
+                dynamic == DynamicType.LOCAL
+                and strategy != QuantizationStrategy.TENSOR_GROUP
+            ):
+                raise ValueError("local is only supported for strategy tensor_group")
+
             if observer is not None:
-                if observer != "memoryless":  # avoid annoying users with old configs
-                    warnings.warn(
-                        "No observer is used for dynamic quantization, setting to None"
-                    )
-                    observer = None
+                if dynamic is True:  # checking if dynamic is True, not "local"
+                    if (
+                        observer != "memoryless"
+                    ):  # avoid annoying users with old configs
+                        warnings.warn(
+                            "No observer is used for dynamic quantization, setting to None"
+                        )
+                        observer = None
+            else:
+                if dynamic == DynamicType.LOCAL:
+                    observer = "minmax"

         elif observer is None:
             # default to minmax for non-dynamic cases
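A usage sketch based on the test cases added in this commit, exercising the new validators; imports follow the module path shown in the diff above:

```python
from compressed_tensors.quantization.quant_args import DynamicType, QuantizationArgs

# The new "dynamic" field validator coerces the string "local" to DynamicType.LOCAL
args = QuantizationArgs(
    num_bits=4,
    type="float",
    strategy="tensor_group",
    group_size=16,
    dynamic="local",
)
assert args.dynamic == DynamicType.LOCAL  # str-based enum compares equal to "local"

# The model validator rejects "local" for strategies other than tensor_group
try:
    QuantizationArgs(num_bits=8, type="int", strategy="token", dynamic="local")
except ValueError as err:
    print(err)  # expected: validation error saying local requires tensor_group
```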

src/compressed_tensors/quantization/quant_config.py

Lines changed: 2 additions & 2 deletions
@@ -16,7 +16,7 @@
 from typing import Dict, List, Optional, Union

 from compressed_tensors.config import CompressionFormat
-from compressed_tensors.quantization.quant_args import QuantizationArgs
+from compressed_tensors.quantization.quant_args import DynamicType, QuantizationArgs
 from compressed_tensors.quantization.quant_scheme import (
     QuantizationScheme,
     preset_name_to_scheme,
@@ -251,7 +251,7 @@ def requires_calibration_data(self):

         for _, scheme in self.config_groups.items():
             if scheme.input_activations is not None:
-                if not scheme.input_activations.dynamic:
+                if scheme.input_activations.dynamic in (False, DynamicType.LOCAL):
                     return True
             if scheme.output_activations is not None:
                 if not scheme.output_activations.dynamic:
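The effect of this change: input-activation schemes that are static or locally dynamic still require calibration data (the global scale is calibrated), and only fully dynamic activations skip it. A quick check of the condition, assuming the import path shown in the diff:

```python
from compressed_tensors.quantization.quant_args import DynamicType

# (dynamic setting, should require calibration data)
for dynamic, needs_calibration in [(False, True), (DynamicType.LOCAL, True), (True, False)]:
    assert (dynamic in (False, DynamicType.LOCAL)) is needs_calibration
```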

src/compressed_tensors/quantization/quant_scheme.py

Lines changed: 5 additions & 7 deletions
@@ -16,6 +16,7 @@
 from typing import Any, Dict, List, Optional

 from compressed_tensors.quantization.quant_args import (
+    DynamicType,
     QuantizationArgs,
     QuantizationStrategy,
     QuantizationType,
@@ -104,22 +105,19 @@ def is_preset_scheme(name: str) -> bool:
     weights=QuantizationArgs(
         num_bits=4,
         type=QuantizationType.FLOAT,
-        strategy=QuantizationStrategy.GROUP,
+        strategy=QuantizationStrategy.TENSOR_GROUP,
         symmetric=True,
         dynamic=False,
         group_size=16,
     )
 )

-# TODO: the local scales are dynamic, the global scale is static/calibrated
-# We could potentially extend the dynamic kwarg so that is goes
-# beyond being just a boolean - however we may also want a dynamically
-# generated global scale, so we could use that to separate between the two
+
 NVFP4 = dict(
     weights=QuantizationArgs(
         num_bits=4,
         type=QuantizationType.FLOAT,
-        strategy=QuantizationStrategy.GROUP,
+        strategy=QuantizationStrategy.TENSOR_GROUP,
         symmetric=True,
         dynamic=False,
         group_size=16,
@@ -129,7 +127,7 @@ def is_preset_scheme(name: str) -> bool:
         type=QuantizationType.FLOAT,
         strategy=QuantizationStrategy.TENSOR_GROUP,
         symmetric=True,
-        dynamic=False,
+        dynamic=DynamicType.LOCAL,
         group_size=16,
     ),
 )
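After this change the NVFP4 preset pairs static tensor_group weights with locally dynamic tensor_group activations. A hand-built sketch of an equivalent scheme dict using the field values visible in the diff; the `input_activations` key name is an assumption based on the test parametrization, since this hunk does not show it:

```python
from compressed_tensors.quantization.quant_args import (
    DynamicType,
    QuantizationArgs,
    QuantizationStrategy,
    QuantizationType,
)

nvfp4_like = dict(
    weights=QuantizationArgs(
        num_bits=4,
        type=QuantizationType.FLOAT,
        strategy=QuantizationStrategy.TENSOR_GROUP,
        symmetric=True,
        dynamic=False,              # static local scales + calibrated global scale
        group_size=16,
    ),
    input_activations=QuantizationArgs(  # assumed key name
        num_bits=4,
        type=QuantizationType.FLOAT,
        strategy=QuantizationStrategy.TENSOR_GROUP,
        symmetric=True,
        dynamic=DynamicType.LOCAL,  # local scales computed on the fly at runtime
        group_size=16,
    ),
)
```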

src/compressed_tensors/quantization/utils/helpers.py

Lines changed: 3 additions & 3 deletions
@@ -171,10 +171,10 @@ def compute_dynamic_scales_and_zp(
     elif args.strategy == QuantizationStrategy.TENSOR:
         reduce_dims = None
     elif args.strategy == QuantizationStrategy.TENSOR_GROUP:
-        # per group dynamic quantization - only valid for
-        # activations
+        if len(value.shape) > 2:
+            value = value.squeeze(0)
+
         dim = {0, 1}
-        value = value.squeeze(0)
         reduce_dims = tuple(idx for idx in range(3) if idx not in dim)
         keep_dims = False
         value = torch.reshape(
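The helpers.py change makes the leading-dim squeeze conditional, so the same tensor_group path handles both 3-D activations and 2-D tensors. A standalone torch sketch of just that reshaping step (not the library function itself):

```python
import torch


def _maybe_squeeze_leading_dim(value: torch.Tensor) -> torch.Tensor:
    # tensor_group dynamic quantization: drop a leading batch dim only if present
    if len(value.shape) > 2:
        value = value.squeeze(0)
    return value


activation = torch.randn(1, 128, 64)  # (batch, seq_len, hidden)
weight_like = torch.randn(128, 64)    # already 2-D; left untouched

assert _maybe_squeeze_leading_dim(activation).shape == (128, 64)
assert _maybe_squeeze_leading_dim(weight_like).shape == (128, 64)

# dims {0, 1} are kept; the remaining (group) dim is reduced over
dim = {0, 1}
reduce_dims = tuple(idx for idx in range(3) if idx not in dim)
assert reduce_dims == (2,)
```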

tests/test_quantization/lifecycle/test_initialize.py

Lines changed: 33 additions & 9 deletions
@@ -156,9 +156,23 @@ def test_initialize_module_for_quantization_offloaded(
             None,
         ),
         (
-            QuantizationArgs(strategy="group", group_size=16, type="float", num_bits=4),
+            QuantizationArgs(
+                strategy="tensor_group", group_size=16, type="float", num_bits=4
+            ),
             None,
         ),
+        (
+            QuantizationArgs(
+                strategy="tensor_group", group_size=16, type="float", num_bits=4
+            ),
+            QuantizationArgs(
+                strategy="tensor_group",
+                group_size=16,
+                type="float",
+                num_bits=4,
+                dynamic="local",
+            ),
+        ),
         (
             QuantizationArgs(strategy="block"),
             QuantizationArgs(strategy="block"),
@@ -184,13 +198,19 @@ def test_initialize_quantization_parameters(weights, input_activations):
             continue
         q_param_name = Q_PARAM_NAMES[q_type]

-        if args.num_bits == 4 and args.type == QuantizationType.FLOAT:
-            assert hasattr(layer, "weight_global_scale")
-            assert layer.weight_global_scale.dtype == torch.float32
-            assert layer.weight_global_scale.numel() == 1
-            assert layer.weight_scale.dtype == FP8_E4M3_DATA.dtype
+        if args.strategy == QuantizationStrategy.TENSOR_GROUP:
+            if q_type == "weights":
+                assert hasattr(layer, "weight_global_scale")
+                assert layer.weight_global_scale.dtype == torch.float32
+                assert layer.weight_global_scale.numel() == 1
+                assert layer.weight_scale.dtype == FP8_E4M3_DATA.dtype
+            elif q_type == "input_activations":
+                assert hasattr(layer, "input_global_scale")
+                assert layer.input_global_scale.dtype == torch.float32
+                assert layer.input_global_scale.numel() == 1
         else:
             assert not hasattr(layer, "weight_global_scale")
+            assert not hasattr(layer, "input_global_scale")

         # scale and zero point
         if args.strategy == QuantizationStrategy.TENSOR:
@@ -199,7 +219,10 @@
         elif args.strategy == QuantizationStrategy.CHANNEL:  # only weight
             expected_shape = (layer.weight.shape[0], 1)

-        elif args.strategy == QuantizationStrategy.GROUP:  # only weight
+        elif args.strategy in (
+            QuantizationStrategy.TENSOR_GROUP,
+            QuantizationStrategy.GROUP,
+        ):
             num_groups = math.ceil(layer.weight.shape[1] / args.group_size)
             expected_shape = (layer.weight.shape[0], max(num_groups, 1))

@@ -209,8 +232,9 @@
         elif args.strategy == QuantizationStrategy.TOKEN:
            expected_shape = (1, 1)

-        assert getattr(layer, f"{q_param_name}_scale").shape == expected_shape
-        assert getattr(layer, f"{q_param_name}_zero_point").shape == expected_shape
+        if not args.dynamic:
+            assert getattr(layer, f"{q_param_name}_scale").shape == expected_shape
+            assert getattr(layer, f"{q_param_name}_zero_point").shape == expected_shape

         # g_idx
         if args.actorder == ActivationOrdering.GROUP:
