Commit eddd2a1

[NVFP4] Add tensor_group strategy; enable NVFP4 Activations (#317)
* add nvfp4 args
* format
* dont use a dataclass
* remove dataclass
* test skeletons
* add tests
* update
* introduce new tensor_group strategy; expand dynamic forward pass to do per group; add input_global_scale
* update
* update dynamic scale generation
* Update test_initialize.py
* clean up
* more clean-up
* add additional checks
* Update forward.py
* Update forward.py
* edit
* remove unused import
* use ceil
1 parent 8a116f7 commit eddd2a1
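
For reference, a minimal sketch of what this commit enables: activations can now be described with the new `tensor_group` strategy, mirroring the `input_activations` entry of the `NVFP4` preset added in `quant_scheme.py`. The import path follows the files touched below; treat the snippet as illustrative rather than an official usage example.

```python
# Sketch: constructing activation quantization args with the new TENSOR_GROUP
# strategy (fields copied from the NVFP4 preset introduced in this commit).
from compressed_tensors.quantization.quant_args import (
    QuantizationArgs,
    QuantizationStrategy,
    QuantizationType,
)

input_args = QuantizationArgs(
    num_bits=4,
    type=QuantizationType.FLOAT,
    strategy=QuantizationStrategy.TENSOR_GROUP,  # new enum member
    symmetric=True,
    dynamic=False,   # global scale stays static; local group scales are computed at runtime
    group_size=16,   # 16-element groups, as in NVFP4
)
print(input_args.strategy.value)  # "tensor_group"
```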

5 files changed: +83, -12 lines changed
src/compressed_tensors/quantization/lifecycle/forward.py

Lines changed: 15 additions & 3 deletions
@@ -189,7 +189,11 @@ def _process_quantization(
     q_min, q_max = calculate_range(args, x.device)
     group_size = args.group_size
 
-    if args.strategy == QuantizationStrategy.GROUP:
+    if args.strategy in (QuantizationStrategy.GROUP, QuantizationStrategy.TENSOR_GROUP):
+        if args.strategy == QuantizationStrategy.TENSOR_GROUP:
+            # only valid for activation; remove dim 0
+            x = x.squeeze(0)
+
         output_dtype = dtype if dtype is not None else x.dtype
         output = torch.zeros_like(x).to(output_dtype)
         columns = output.shape[1]
@@ -251,6 +255,9 @@ def _process_quantization(
         if not is_column_order:
             output = safe_permute(output, torch.argsort(perm), dim=1)
 
+        if args.strategy == QuantizationStrategy.TENSOR_GROUP:
+            output = output.unsqueeze(0)
+
     else:  # covers channel, token and tensor strategies
         if do_quantize:
             output = _quantize(
@@ -352,9 +359,11 @@ def forward_quantize(
     g_idx = getattr(module, "weight_g_idx", None)
     global_scale = getattr(module, f"{base_name}_global_scale", None)
 
-    if args.dynamic:
+    if args.dynamic or args.strategy == QuantizationStrategy.TENSOR_GROUP:
         # dynamic quantization - determine the scale/zp on the fly
-        scale, zero_point = compute_dynamic_scales_and_zp(value=value, args=args)
+        scale, zero_point = compute_dynamic_scales_and_zp(
+            value=value, args=args, module=module, global_scale=global_scale
+        )
     else:
         # static quantization - get scale and zero point from layer
         scale = getattr(module, f"{base_name}_scale")
@@ -388,6 +397,7 @@ def _quantize(
         scale = scale.to(global_scale.dtype) / global_scale
 
     scaled = x / scale
+
     if zero_point is not None:
         scaled += zero_point.to(x.dtype)
 
@@ -398,6 +408,7 @@ def _quantize(
         q_max,
     )
     quantized_value = round_to_quantized_type(clamped_value, args)
+
    if dtype is not None:
        quantized_value = quantized_value.to(dtype)
 
@@ -422,6 +433,7 @@ def _dequantize(
 
     if zero_point is not None:
         dequant_value = dequant_value - zero_point.to(scale.dtype)
+
     dequant_value = dequant_value * scale
 
     if dtype is not None:

src/compressed_tensors/quantization/lifecycle/initialize.py

Lines changed: 10 additions & 2 deletions
@@ -181,6 +181,7 @@ def _initialize_scale_zero_point(
     # there is likely bug
 
     if is_fp4(quantization_args=quantization_args) and base_name == "weight":
+        assert quantization_args.strategy == QuantizationStrategy.GROUP
         scale_dtype = FP8_E4M3_DATA.dtype
         # When applying weight-only FP4 quantization, generate a global_scale
         # This scale is applied during runtime to ensure that the generated
@@ -193,19 +194,26 @@ def _initialize_scale_zero_point(
             module, f"{base_name}_global_scale", init_global_scale
         )
 
+    # initializes empty scale, zero point, and g_idx parameters for the module
+    if is_fp4(quantization_args=quantization_args) and base_name == "input":
+        assert quantization_args.strategy == QuantizationStrategy.TENSOR_GROUP
+        scale_dtype = torch.float32
+        scale_name = f"{base_name}_global_scale"
+    else:
+        scale_name = f"{base_name}_scale"
+
     if scale_dtype not in [
         torch.float16,
         torch.bfloat16,
         torch.float32,
     ] and not is_fp4(quantization_args=quantization_args):
         scale_dtype = torch.float16
 
-    # initializes empty scale, zero point, and g_idx parameters for the module
     init_scale = Parameter(
         torch.empty(expected_shape, dtype=scale_dtype, device=device),
         requires_grad=False,
     )
-    register_offload_parameter(module, f"{base_name}_scale", init_scale)
+    register_offload_parameter(module, scale_name, init_scale)
 
     if force_zero_point or not quantization_args.symmetric:
         if is_fp4(quantization_args=quantization_args):

src/compressed_tensors/quantization/quant_args.py

Lines changed: 3 additions & 1 deletion
@@ -98,6 +98,7 @@ class QuantizationStrategy(str, Enum):
     GROUP = "group"
     BLOCK = "block"
     TOKEN = "token"
+    TENSOR_GROUP = "tensor_group"
 
 
 class ActivationOrdering(Aliasable, str, Enum):
@@ -239,7 +240,8 @@ def validate_model_after(model: "QuantizationArgs") -> "QuantizationArgs":
         if (
             group_size is not None
             and group_size > 0
-            and strategy != QuantizationStrategy.GROUP
+            and strategy
+            not in (QuantizationStrategy.GROUP, QuantizationStrategy.TENSOR_GROUP)
         ):
             raise ValueError("group_size requires strategy to be set to 'group'")
 
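
With the validator relaxed, a positive `group_size` is now accepted for both `group` and `tensor_group` (the error message still only mentions `'group'`). A quick sketch of the old versus new check, written as plain boolean logic:

```python
# Before, any positive group_size with a strategy other than GROUP raised;
# now TENSOR_GROUP is also allowed.
from compressed_tensors.quantization.quant_args import QuantizationStrategy

group_size = 16
strategy = QuantizationStrategy.TENSOR_GROUP

old_rejects = group_size is not None and group_size > 0 and strategy != QuantizationStrategy.GROUP
new_rejects = group_size is not None and group_size > 0 and strategy not in (
    QuantizationStrategy.GROUP,
    QuantizationStrategy.TENSOR_GROUP,
)
print(old_rejects, new_rejects)  # True False
```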

src/compressed_tensors/quantization/quant_scheme.py

Lines changed: 24 additions & 0 deletions
@@ -111,6 +111,29 @@ def is_preset_scheme(name: str) -> bool:
     )
 )
 
+# TODO: the local scales are dynamic, the global scale is static/calibrated
+# We could potentially extend the dynamic kwarg so that is goes
+# beyond being just a boolean - however we may also want a dynamically
+# generated global scale, so we could use that to separate between the two
+NVFP4 = dict(
+    weights=QuantizationArgs(
+        num_bits=4,
+        type=QuantizationType.FLOAT,
+        strategy=QuantizationStrategy.GROUP,
+        symmetric=True,
+        dynamic=False,
+        group_size=16,
+    ),
+    input_activations=QuantizationArgs(
+        num_bits=4,
+        type=QuantizationType.FLOAT,
+        strategy=QuantizationStrategy.TENSOR_GROUP,
+        symmetric=True,
+        dynamic=False,
+        group_size=16,
+    ),
+)
+
 # 8 bit integer weights and 8 bit activations quantization
 INT8_W8A8 = dict(
     weights=QuantizationArgs(
@@ -237,4 +260,5 @@ def is_preset_scheme(name: str) -> bool:
     "FP8": FP8,
     "FP8_DYNAMIC": FP8_DYNAMIC,
     "NVFP4A16": NVFP4A16,
+    "NVFP4": NVFP4,
 }
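
Back-of-the-envelope for the new `NVFP4` preset: with `group_size=16`, each token's hidden vector is split into ceil(hidden / 16) groups, each carrying one dynamically computed local scale, while a single calibrated global scale is shared per tensor. The hidden size below is a placeholder:

```python
# Scale count per token under the NVFP4 preset (group_size=16).
import math

hidden = 4096
group_size = 16
groups_per_token = math.ceil(hidden / group_size)
print(groups_per_token)  # 256 local scales per token, plus 1 global scale per tensor
```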

src/compressed_tensors/quantization/utils/helpers.py

Lines changed: 31 additions & 6 deletions
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import logging
+import math
 from typing import Generator, List, Optional, Tuple
 
 import torch
@@ -103,7 +104,9 @@ def calculate_qparams(
     if is_fp4(quantization_args=quantization_args) and global_scale is not None:
         # Conditionally scale the generated local scale by a global_scale
         scales = global_scale * (max_val_pos / FP4_E2M1_DATA.max)
+        scales = torch.clamp(scales, max=FP8_E4M3_DATA.max, min=FP8_E4M3_DATA.min)
         scales = scales.to(FP8_E4M3_DATA.dtype)
+
     else:
         scales = max_val_pos / (float(bit_range) / 2)
 
@@ -143,7 +146,12 @@ def calculate_qparams(
     return scales, zero_points
 
 
-def compute_dynamic_scales_and_zp(value: Tensor, args: QuantizationArgs):
+def compute_dynamic_scales_and_zp(
+    value: Tensor,
+    args: QuantizationArgs,
+    module: torch.nn.Module,
+    global_scale: Optional[Tensor] = None,
+):
     """
     Returns the computed scales and zero points for dynamic activation
     quantization.
@@ -155,24 +163,41 @@ def compute_dynamic_scales_and_zp(value: Tensor, args: QuantizationArgs):
         reduced dimensions
     :return: tuple of scale and zero point derived from the observed tensor
     """
+
+    keep_dims = True
     if args.strategy == QuantizationStrategy.TOKEN:
         dim = {1, 2}
         reduce_dims = tuple(idx for idx in range(value.ndim) if idx not in dim)
     elif args.strategy == QuantizationStrategy.TENSOR:
         reduce_dims = None
+    elif args.strategy == QuantizationStrategy.TENSOR_GROUP:
+        # per group dynamic quantization - only valid for
+        # activations
+        dim = {0, 1}
+        value = value.squeeze(0)
+        reduce_dims = tuple(idx for idx in range(3) if idx not in dim)
+        keep_dims = False
+        value = torch.reshape(
+            value,
+            (
+                value.shape[0],
+                math.ceil(value.shape[1] / args.group_size),
+                args.group_size,
+            ),
+        )
     else:
         raise ValueError(
-            f"One of {QuantizationStrategy.TOKEN} or {QuantizationStrategy.TENSOR} ",
-            "must be used for dynamic quantization",
+            "Dynamic quantization is only supported for ",
+            f"{QuantizationStrategy.TOKEN, QuantizationStrategy.TENSOR, QuantizationStrategy.TENSOR_GROUP}",
         )
 
     if not reduce_dims:
         min_val, max_val = torch.aminmax(value)
     else:
-        min_val = torch.amin(value, dim=reduce_dims, keepdims=True)
-        max_val = torch.amax(value, dim=reduce_dims, keepdims=True)
+        min_val = torch.amin(value, dim=reduce_dims, keepdims=keep_dims)
+        max_val = torch.amax(value, dim=reduce_dims, keepdims=keep_dims)
 
-    return calculate_qparams(min_val, max_val, args)
+    return calculate_qparams(min_val, max_val, args, global_scale=global_scale)
 
 
 def calculate_range(quantization_args: QuantizationArgs, device: str) -> Tuple:
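
The `TENSOR_GROUP` path in `compute_dynamic_scales_and_zp` squeezes the `(1, tokens, hidden)` activation, regroups it into `(tokens, n_groups, group_size)`, and reduces over the last dimension, yielding one min/max pair per 16-element group. A standalone sketch of that reduction with illustrative shapes (divisible by the group size, so the reshape is exact):

```python
# Sketch of the per-group min/max reduction added for TENSOR_GROUP.
import math
import torch

value = torch.randn(1, 4, 64)                 # (batch=1, tokens=4, hidden=64)
group_size = 16

value = value.squeeze(0)                      # (4, 64)
value = value.reshape(
    value.shape[0],
    math.ceil(value.shape[1] / group_size),   # 4 groups per token
    group_size,
)                                             # (4, 4, 16)

reduce_dims = tuple(idx for idx in range(3) if idx not in {0, 1})  # (2,)
min_val = torch.amin(value, dim=reduce_dims, keepdim=False)
max_val = torch.amax(value, dim=reduce_dims, keepdim=False)
print(min_val.shape, max_val.shape)           # torch.Size([4, 4]) torch.Size([4, 4])
```

These per-group min/max values then flow into `calculate_qparams`, where (per the hunk above) the FP4 local scale becomes `global_scale * (max_val_pos / FP4_E2M1_DATA.max)`, clamped to the FP8 E4M3 range before being cast to the FP8 dtype.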
