
Commit 61e5c00

dsikka and rahul-tuli authored
[NVFP4] Expand observers to calculate gparam, support NVFP4 Activations (#1487)
# SUMMARY:
- Add NVFP4 example
- Update the compression condition to no longer be weight-only
- Support NVFP4 activations:
  - Update observers to also provide the option to calculate gparam (global parameter), not just qparams
  - Update dynamic activation condition checks to consider DynamicType.LOCAL

# Testing
- All test cases pass

# Next Steps:
We now have the framework to also calculate the weight global scale in llmcompressor. We will remove it from compressed-tensors and add it here once this lands.

Co-authored-by: Rahul Tuli <rtuli@redhat.com>
1 parent a0adf83 commit 61e5c00

File tree: 8 files changed, +270 -31 lines changed
(new file: the NVFP4 example script added by this commit)

Lines changed: 73 additions & 0 deletions
```python
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"

# Load model.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, device_map="auto", torch_dtype="auto"
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)


DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"

# Select the number of calibration samples. 512 samples is a good place to
# start; increasing the number of samples can improve accuracy. A small
# count is used here to keep the example fast.
NUM_CALIBRATION_SAMPLES = 20
MAX_SEQUENCE_LENGTH = 2048

# Load dataset and preprocess.
ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
ds = ds.shuffle(seed=42)


def preprocess(example):
    return {
        "text": tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
        )
    }


ds = ds.map(preprocess)


# Tokenize inputs.
def tokenize(sample):
    return tokenizer(
        sample["text"],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )


ds = ds.map(tokenize, remove_columns=ds.column_names)

# Configure the quantization algorithm and scheme.
# In this case, we:
#   * quantize the weights to fp4 with a group size of 16 via PTQ
#   * calibrate a global_scale for activations, which will be used to
#     quantize activations to fp4 on the fly
recipe = QuantizationModifier(targets="Linear", scheme="NVFP4", ignore=["lm_head"])

# Apply quantization.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

# Save to disk in compressed-tensors format.
SAVE_DIR = MODEL_ID.split("/")[1] + "-NVFP4"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
```
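After saving, a quick sanity check is to reload the checkpoint and generate. A minimal sketch, assuming the installed transformers/compressed-tensors versions can decompress NVFP4 checkpoints on load:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# SAVE_DIR matches the directory written by the example above.
SAVE_DIR = "Meta-Llama-3-8B-Instruct-NVFP4"

model = AutoModelForCausalLM.from_pretrained(
    SAVE_DIR, device_map="auto", torch_dtype="auto"
)
tokenizer = AutoTokenizer.from_pretrained(SAVE_DIR)

# Generate a short completion to confirm the model loaded and runs.
inputs = tokenizer("Hello my name is", return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(output[0]))
```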

src/llmcompressor/modifiers/quantization/calibration.py

Lines changed: 40 additions & 6 deletions
```diff
@@ -2,9 +2,11 @@
 
 import torch
 from compressed_tensors.quantization import (
+    DynamicType,
     KVCacheScaleType,
     QuantizationScheme,
     QuantizationStatus,
+    QuantizationStrategy,
 )
 from compressed_tensors.quantization.lifecycle.forward import forward_quantize
 from compressed_tensors.quantization.utils import is_kv_cache_quant_scheme
@@ -53,7 +55,10 @@ def initialize_observer(
 
     quantization_args = getattr(quantization_scheme, arg_name, None)
     # don't need observers for fully dynamic quantization
-    if quantization_args is not None and not quantization_args.dynamic:
+    if quantization_args is not None and quantization_args.dynamic in (
+        False,
+        DynamicType.LOCAL,
+    ):
         observer = Observer.load_from_registry(
             quantization_args.observer,
             quantization_args=quantization_args,
@@ -84,14 +89,43 @@ def call_observer(module: Module, base_name: str, value: Optional[torch.Tensor]
             "Must provide a value to observe if not using weight observer"
         )
 
+    quantization_scheme = getattr(module, "quantization_scheme", None)
+    arg_name = "weights" if base_name == "weight" else f"{base_name}_activations"
+    quant_args = getattr(quantization_scheme, arg_name, None)
+
+    # We always calculate quantization parameters by default, and no global parameters
+    should_calculate_gparam = False
+    should_calculate_qparams = True
+
+    # TODO: will update to be the case for both weight and input in a follow-up;
+    # the weight global-scale calculation is currently done in ct and
+    # should be moved here to unify global scale calculations
+    if (
+        quant_args.strategy == QuantizationStrategy.TENSOR_GROUP
+        and base_name == "input"
+    ):
+        should_calculate_gparam = True
+        should_calculate_qparams = False
+
     observer = getattr(module, f"{base_name}_observer")
-    updated_scale, updated_zero_point = observer(
-        value, g_idx=g_idx, global_scale=global_scale
+    observer_outputs = observer(
+        value,
+        g_idx=g_idx,
+        global_scale=global_scale,
+        should_calculate_gparam=should_calculate_gparam,
     )
 
-    # update scale and zero point
-    update_parameter_data(module, updated_scale, f"{base_name}_scale")
-    update_parameter_data(module, updated_zero_point, f"{base_name}_zero_point")
+    if should_calculate_gparam:
+        updated_global_scale = observer_outputs
+        update_parameter_data(
+            module, updated_global_scale, f"{base_name}_global_scale"
+        )
+
+    if should_calculate_qparams:
+        # update scale and zero point
+        updated_scale, updated_zero_point = observer_outputs
+        update_parameter_data(module, updated_scale, f"{base_name}_scale")
+        update_parameter_data(module, updated_zero_point, f"{base_name}_zero_point")
 
 
 def update_weight_zp_scale(module: Module):
```
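Note on the new calling convention: the observer now returns a single global-scale tensor when `should_calculate_gparam` is set, and the usual `(scale, zero_point)` tuple otherwise. A minimal standalone sketch of that contract (a toy function, not the llm-compressor `Observer` class):

```python
import torch


def toy_observer(value: torch.Tensor, should_calculate_gparam: bool = False):
    # Toy stand-in for Observer.forward: one tensor out on the gparam path,
    # a (scale, zero_point) tuple out on the qparam path.
    if should_calculate_gparam:
        return value.abs().max()  # placeholder global scale
    scale = value.abs().max() / 6.0  # placeholder local scale (FP4 max = 6)
    zero_point = torch.zeros_like(scale, dtype=torch.int8)
    return scale, zero_point


x = torch.randn(4, 16)
gscale = toy_observer(x, should_calculate_gparam=True)  # single tensor
scale, zp = toy_observer(x)                             # tuple
```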

src/llmcompressor/modifiers/quantization/quantization/mixin.py

Lines changed: 9 additions & 2 deletions
```diff
@@ -2,6 +2,7 @@
 
 import torch
 from compressed_tensors.quantization import (
+    DynamicType,
     QuantizationArgs,
     QuantizationConfig,
     QuantizationScheme,
@@ -212,7 +213,10 @@ def _initialize_observers(self, module: torch.nn.Module):
             return
 
         scheme: QuantizationScheme = module.quantization_scheme
-        input = scheme.input_activations and not scheme.input_activations.dynamic
+        input = scheme.input_activations and scheme.input_activations.dynamic in (
+            False,
+            DynamicType.LOCAL,
+        )
         weight = scheme.weights is not None
         output = scheme.output_activations and not scheme.output_activations.dynamic
         is_attention = is_attention_module(module)
@@ -241,7 +245,10 @@ def _initialize_hooks(self, model: torch.nn.Module) -> Set[RemovableHandle]:
                 continue
 
             scheme: QuantizationScheme = module.quantization_scheme
-            input = scheme.input_activations and not scheme.input_activations.dynamic
+            input = scheme.input_activations and scheme.input_activations.dynamic in (
+                False,
+                DynamicType.LOCAL,
+            )
             output = scheme.output_activations and not scheme.output_activations.dynamic
             is_attention = is_attention_module(module)
```
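Why the membership test instead of `not ... dynamic`: `DynamicType.LOCAL` is a truthy enum member, so the old check would have skipped observer setup for locally-dynamic NVFP4 activations. A standalone illustration (toy enum standing in for compressed-tensors' `DynamicType`):

```python
from enum import Enum


class DynamicType(Enum):  # toy stand-in for the compressed-tensors enum
    LOCAL = "local"


for dynamic in (False, True, DynamicType.LOCAL):
    old = not dynamic                             # old check: observer needed?
    new = dynamic in (False, DynamicType.LOCAL)   # new check
    print(dynamic, old, new)
# False             -> old True,  new True   (static: observer needed)
# True              -> old False, new False  (fully dynamic: no observer)
# DynamicType.LOCAL -> old False, new True   (local: observer now initialized)
```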

src/llmcompressor/observers/base.py

Lines changed: 29 additions & 1 deletion
```diff
@@ -40,6 +40,7 @@ def forward(
         observed: Tensor,
         g_idx: Optional[Tensor] = None,
         global_scale: Optional[Tensor] = None,
+        should_calculate_gparam: bool = False,
     ) -> Tuple[FloatTensor, IntTensor]:
         """
         maps directly to get_qparams
@@ -50,8 +51,12 @@ def forward(
         :return: tuple of scale and zero point based on last observed value
         """
         self.record_observed_tokens(observed)
+        if should_calculate_gparam:
+            return self.get_gparam(observed=observed)
         return self.get_qparams(
-            observed=observed, g_idx=g_idx, global_scale=global_scale
+            observed=observed,
+            g_idx=g_idx,
+            global_scale=global_scale,
         )
 
     def calculate_qparams(
@@ -68,11 +73,34 @@ def calculate_qparams(
         """
         raise NotImplementedError(f"{self.__class__} must implement calculate_qparams")
 
+    def calculate_gparam(
+        self,
+        observed: Tensor,
+    ) -> torch.Tensor:
+        """
+        :param observed: observed tensor to calculate quantization parameters for
+        :return: global scale derived from the observed tensor
+        """
+        raise NotImplementedError(f"{self.__class__} must implement calculate_gparam")
+
     def post_calculate_qparams(self) -> None:
         """
         Run any logic specific to its observers after running calculate_qparams
         """
 
+    def get_gparam(self, observed: Tensor):
+        """
+        Function to derive a global scale parameter
+        :param observed: observed tensor to calculate global parameters from
+        :return: derived global scale
+        """
+        if self.quantization_args.strategy == QuantizationStrategy.TENSOR_GROUP:
+            return self.calculate_gparam(observed)
+        raise NotImplementedError(
+            "global parameter generation is only supported for TENSOR_GROUP"
+        )
+
     def get_qparams(
         self,
         observed: Optional[Tensor] = None,
```
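A concrete observer only needs to supply `calculate_gparam`; `get_gparam` handles the TENSOR_GROUP gating. A self-contained sketch of what such an override might look like (illustrative abs-max policy; in llm-compressor this would live on an `Observer` subclass):

```python
import torch


# Standalone sketch: what a concrete observer supplies for the gparam path.
# 448 and 6 are the FP8_E4M3 / FP4_E2M1 maxima used by the NVFP4 defaults.
class AbsMaxGlobalScale:  # would subclass llmcompressor.observers.base.Observer
    def calculate_gparam(self, observed: torch.Tensor) -> torch.Tensor:
        # Map the tensor's abs-max into the FP8 range while mapping group
        # maxima to the FP4 max (the MinMaxObserver below delegates this
        # arithmetic to generate_gparam instead).
        return (448.0 * 6.0 / observed.abs().max()).to(torch.float32)


print(AbsMaxGlobalScale().calculate_gparam(torch.randn(8, 32)))
```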

src/llmcompressor/observers/helpers.py

Lines changed: 33 additions & 1 deletion
```diff
@@ -1,8 +1,14 @@
 from collections import Counter
+from typing import Optional
 
 import torch
+from compressed_tensors.quantization.quant_args import (
+    FP4_E2M1_DATA,
+    FP8_E4M3_DATA,
+    FloatArgs,
+)
 
-__all__ = ["get_observer_token_count"]
+__all__ = ["get_observer_token_count", "generate_gparam"]
 
 
 def get_observer_token_count(module: torch.nn.Module) -> Counter:
@@ -20,3 +26,29 @@ def get_observer_token_count(module: torch.nn.Module) -> Counter:
         module._num_observed_tokens
     )
     return token_counts
+
+
+# TODO: we have a similar function in ct already;
+# consolidate when adding weight global scale generation
+def generate_gparam(
+    updated_min_val: torch.Tensor,
+    updated_max_val: torch.Tensor,
+    scale_data: Optional[FloatArgs] = FP8_E4M3_DATA,
+    quant_data: Optional[FloatArgs] = FP4_E2M1_DATA,
+    dtype: Optional[torch.dtype] = torch.float32,
+):
+    """
+    Generate a global scale for an entire tensor (input_tensor).
+    The goal of the scale is to ensure that the quantization (local) scale
+    falls into the appropriate dtype range.
+
+    E.g. for NVFP4, group (local) scales are in dtype FP8. The global_scale
+    attempts to use the entire FP8 dtype range while mapping a per-group max
+    to the FP4 max.
+    """
+    min_vals = torch.min(updated_min_val, torch.zeros_like(updated_min_val))
+    max_vals = torch.max(updated_max_val, torch.zeros_like(updated_max_val))
+    max_val_pos = torch.max(torch.abs(min_vals), torch.abs(max_vals))
+    global_scale = scale_data.max * quant_data.max / max_val_pos
+    return global_scale.to(dtype)
```
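Worked example of the formula `global_scale = scale_data.max * quant_data.max / max_val_pos`: with the NVFP4 defaults (FP8_E4M3 max = 448, FP4_E2M1 max = 6), a tensor whose largest magnitude is 10.0 yields 448 * 6 / 10 = 268.8. A self-contained check, with the two dtype maxima inlined rather than imported from compressed-tensors:

```python
import torch

FP8_E4M3_MAX = 448.0  # assumed max representable value of FP8 E4M3
FP4_E2M1_MAX = 6.0    # assumed max representable value of FP4 E2M1


def generate_gparam_demo(min_val: torch.Tensor, max_val: torch.Tensor) -> torch.Tensor:
    # Mirrors generate_gparam above with the default dtype args inlined.
    min_vals = torch.min(min_val, torch.zeros_like(min_val))
    max_vals = torch.max(max_val, torch.zeros_like(max_val))
    max_val_pos = torch.max(min_vals.abs(), max_vals.abs())
    return (FP8_E4M3_MAX * FP4_E2M1_MAX / max_val_pos).to(torch.float32)


g = generate_gparam_demo(torch.tensor(-10.0), torch.tensor(7.5))
print(g)  # tensor(268.8000): 448 * 6 / 10
```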

src/llmcompressor/observers/min_max.py

Lines changed: 44 additions & 11 deletions
```diff
@@ -6,6 +6,7 @@
 from compressed_tensors.utils import deprecated
 
 from llmcompressor.observers.base import Observer
+from llmcompressor.observers.helpers import generate_gparam
 
 __all__ = ["MinMaxObserver", "MovingAverageMinMaxObserver"]
 
@@ -29,13 +30,12 @@ def __init__(
         self.max_val = {}
         self.averaging_constant = averaging_constant
 
-    def calculate_qparams(
+    def calculate_updated_min_max(
         self,
         observed: torch.Tensor,
         reduce_dims: Optional[Tuple[int]] = None,
         tensor_id: Optional[Any] = None,
-        global_scale: Optional[torch.Tensor] = None,
-    ) -> Tuple[torch.FloatTensor, torch.IntTensor]:
+    ):
         """
         Updates the observed min and max using a moving average smoothed by the
         averaging_constant. Set the averaging_constant to 1.0 to disable averaging.
@@ -46,8 +46,7 @@ def calculate_qparams(
             reduced dimensions
         :param tensor_id: Optional id if different ranges of observed tensors are
             passed, useful for sharding tensors by group_size
-        :param global_scale: optional scale to further scale local quantization scales
-        :return: tuple of scale and zero point derived from the observed tensor
+        :return: updated min and max values
         """
         tensor_id = tensor_id or "default"
 
@@ -59,12 +58,7 @@ def calculate_qparams(
 
         # early stopping, save some computation and memory
         if self.averaging_constant == 1.0:
-            return calculate_qparams(
-                min_vals=min_val,
-                max_vals=max_val,
-                quantization_args=self.quantization_args,
-                global_scale=global_scale,
-            )
+            return min_val, max_val
 
         running_min_val = self.min_val.get(tensor_id, None)
         running_max_val = self.max_val.get(tensor_id, None)
@@ -82,7 +76,46 @@ def calculate_qparams(
 
         self.min_val[tensor_id] = updated_min_val
         self.max_val[tensor_id] = updated_max_val
+        return updated_min_val, updated_max_val
+
+    def calculate_gparam(self, observed: torch.Tensor) -> torch.Tensor:
+        """
+        Generate a global scale using the observed min and max.
 
+        :param observed: observed tensor to calculate quantization parameters for
+        :return: updated global scale derived from the observed tensor
+        """
+        updated_min_val, updated_max_val = self.calculate_updated_min_max(
+            observed=observed
+        )
+        return generate_gparam(
+            updated_min_val=updated_min_val, updated_max_val=updated_max_val
+        )
+
+    def calculate_qparams(
+        self,
+        observed: torch.Tensor,
+        reduce_dims: Optional[Tuple[int]] = None,
+        tensor_id: Optional[Any] = None,
+        global_scale: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.FloatTensor, torch.IntTensor]:
+        """
+        Generate a scale and zero-point using the observed min and max.
+
+        :param observed: observed tensor to calculate quantization parameters for
+        :param reduce_dims: optional tuple of dimensions to reduce along,
+            returned scale and zero point will be shaped (1,) along the
+            reduced dimensions
+        :param tensor_id: Optional id if different ranges of observed tensors are
+            passed, useful for sharding tensors by group_size
+        :param global_scale: optional scale to further scale local quantization scales
+        :return: tuple of scale and zero point derived from the observed tensor
+        """
+        updated_min_val, updated_max_val = self.calculate_updated_min_max(
+            observed=observed, tensor_id=tensor_id, reduce_dims=reduce_dims
+        )
         return calculate_qparams(
             min_vals=updated_min_val,
             max_vals=updated_max_val,
```