
Commit 559ad81

[NVFP4] Update global scale generation (#1508)
# SUMMARY:
- Reopening as #1504 merged through automerge with just one review
- Requires: neuralmagic/compressed-tensors#339
- Uses observers to generate global weight scales. These were previously generated during the init function in compressed-tensors; using observers is more consistent with our workflows and parameter lifecycle.
- Also moves the fused-layer update step into llmcompressor; this can be removed once we have an update from vLLM. For now it requires splitting the update_weight_global_scale and update_weight_zp_scale steps; these can be combined once the vLLM change is made.
- Updates examples to include sample generation; this is now very quick thanks to this PR: neuralmagic/compressed-tensors#336

Note: The mse observer is tightly tied to generating a scale and zero-point, so it can't be used for global scale generation at the moment. We will have to decouple this functionality in order to support general scale optimization.

# TEST PLAN:
- Tested e2e with nvfp4 and nvfp4a16
- Validated that existing workflows work e2e (w4a16, sparse2of4 + fp8, w8a8 int8/fp8)
1 parent 2caa1e7 commit 559ad81
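For readers skimming the diff, the new flow is sketched below: a minimal, hedged recreation of what QuantizationModifier.on_start does after this change (see src/llmcompressor/modifiers/quantization/quantization/base.py further down). It only uses functions introduced or touched by this commit; the standalone helper name calibrate_weights is illustrative and not part of the codebase.

```python
import tqdm

from llmcompressor.modifiers.quantization.calibration import (
    update_weight_global_scale,
    update_weight_zp_scale,
)
from llmcompressor.modifiers.utils import update_fused_layer_weight_global_scales


def calibrate_weights(model):
    """Illustrative sketch of the two-pass weight calibration introduced here."""
    modules = list(model.modules())

    # Pass 1: let each module's weight observer compute weight_global_scale
    # (only TENSOR_GROUP / NVFP4 schemes do anything in this step).
    for module in tqdm.tqdm(modules):
        update_weight_global_scale(module)

    # Pass 2: fuse q/k/v and gate/up global scales (currently required by vLLM),
    # then compute the per-group weight scale and zero point.
    for module in tqdm.tqdm(modules, desc="Calibrating weights"):
        update_fused_layer_weight_global_scales(module)
        update_weight_zp_scale(module)
```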

File tree

8 files changed: +179 -68 lines changed


examples/quantization_w4a16_fp4/llama3_example.py

Lines changed: 8 additions & 0 deletions
@@ -19,6 +19,14 @@
 # Apply quantization.
 oneshot(model=model, recipe=recipe)
 
+print("\n\n")
+print("========== SAMPLE GENERATION ==============")
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
+output = model.generate(input_ids, max_new_tokens=100)
+print(tokenizer.decode(output[0]))
+print("==========================================\n\n")
+
+
 # Save to disk in compressed-tensors format.
 SAVE_DIR = MODEL_ID.split("/")[1] + "-NVFP4A16"
 model.save_pretrained(SAVE_DIR, save_compressed=True)

examples/quantization_w4a4_fp4/llama3_example.py

Lines changed: 8 additions & 0 deletions
@@ -67,6 +67,14 @@ def tokenize(sample):
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
 )
 
+print("\n\n")
+print("========== SAMPLE GENERATION ==============")
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
+output = model.generate(input_ids, max_new_tokens=100)
+print(tokenizer.decode(output[0]))
+print("==========================================\n\n")
+
+
 # Save to disk in compressed-tensors format.
 SAVE_DIR = MODEL_ID.split("/")[1] + "-NVFP4"
 model.save_pretrained(SAVE_DIR, save_compressed=True)

src/llmcompressor/modifiers/quantization/calibration.py

Lines changed: 50 additions & 32 deletions
@@ -29,6 +29,7 @@
     "freeze_module_quantization",
     "apply_calibration_status",
     "reset_quantization_status",
+    "update_weight_global_scale",
 ]
 
 
@@ -66,7 +67,13 @@ def initialize_observer(
     module.register_module(f"{base_name}_observer", observer)
 
 
-def call_observer(module: Module, base_name: str, value: Optional[torch.Tensor] = None):
+def call_observer(
+    module: Module,
+    base_name: str,
+    value: Optional[torch.Tensor] = None,
+    should_calculate_gparam: bool = False,
+    should_calculate_qparams: bool = True,
+):
     """
     Call a module's attached input/weight/output observer using a provided value.
     Update the module's scale and zp using the observer's return values.
@@ -80,54 +87,51 @@ def call_observer(module: Module, base_name: str, value: Optional[torch.Tensor]
     if base_name == "weight":
         value = module.weight
         g_idx = getattr(module, "weight_g_idx", None)
-        global_scale = getattr(module, f"{base_name}_global_scale", None)
     elif value is not None:
         g_idx = None
-        global_scale = None
     else:
         raise ValueError(
             "Must provide a value to observe if not using weight observer"
         )
 
-    quantization_scheme = getattr(module, "quantization_scheme", None)
-    arg_name = "weights" if base_name == "weight" else f"{base_name}_activations"
-    quant_args = getattr(quantization_scheme, arg_name, None)
-
-    # We always calculate quantizaton parameters by default and no global parameters
-    should_calculate_gparam = False
-    should_calculate_qparams = True
-
-    # TODO: will update to be the case for both weight and input in a follow-up
-    # weight global calculate is currently done in ct right now;
-    # should be moved here to unify global scale calculations
-    if (
-        quant_args.strategy == QuantizationStrategy.TENSOR_GROUP
-        and base_name == "input"
-    ):
-        should_calculate_gparam = True
-        should_calculate_qparams = False
-
     observer = getattr(module, f"{base_name}_observer")
-    observer_outputs = observer(
-        value,
-        g_idx=g_idx,
-        global_scale=global_scale,
-        should_calculate_gparam=should_calculate_gparam,
-    )
 
     if should_calculate_gparam:
-        updated_global_scale = observer_outputs
-        update_parameter_data(
-            module, updated_global_scale, f"{base_name}_global_scale"
+        global_scale = observer(
+            value,
+            should_calculate_gparam=True,
         )
+        update_parameter_data(module, global_scale, f"{base_name}_global_scale")
+    else:
+        global_scale = getattr(module, f"{base_name}_global_scale", None)
 
     if should_calculate_qparams:
-        # update scale and zero point
-        updated_scale, updated_zero_point = observer_outputs
+        updated_scale, updated_zero_point = observer(
+            value, g_idx=g_idx, global_scale=global_scale
+        )
         update_parameter_data(module, updated_scale, f"{base_name}_scale")
         update_parameter_data(module, updated_zero_point, f"{base_name}_zero_point")
 
 
+def update_weight_global_scale(module: Module):
+    if getattr_chain(module, "quantization_scheme.weights", None) is None:
+        return
+
+    if (
+        getattr_chain(module, "quantization_scheme.weights.strategy", None)
+        != QuantizationStrategy.TENSOR_GROUP
+    ):
+        return
+
+    call_observer(
+        module,
+        base_name="weight",
+        should_calculate_gparam=True,
+        should_calculate_qparams=False,
+    )
+    module.weight_observer.reset()
+
+
 def update_weight_zp_scale(module: Module):
     """
     marks a layer as ready for calibration which activates observers
@@ -165,10 +169,24 @@ def calibrate_activations(module: Module, value: torch.Tensor, base_name: str):
     if value.numel() == 0:
         return
 
+    quantization_scheme = getattr(module, "quantization_scheme", None)
+    quantization_args = getattr(quantization_scheme, f"{base_name}_activations", None)
+
+    calculate_qparams = True
+    calculate_gparam = False
+
+    if quantization_args is not None:
+        if quantization_args.dynamic in (True, DynamicType.LOCAL):
+            calculate_qparams = False
+        if quantization_args.strategy == QuantizationStrategy.TENSOR_GROUP:
+            calculate_gparam = True
+
     call_observer(
         module=module,
         base_name=base_name,
         value=value,
+        should_calculate_gparam=calculate_gparam,
+        should_calculate_qparams=calculate_qparams,
     )
 
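As a quick reference for the new call_observer flags above, here is a hedged, illustrative pairing of flag settings with the two NVFP4 (TENSOR_GROUP) cases this file now handles. The helper name calibrate_nvfp4_layer is hypothetical, and the module passed in is assumed to already carry a TENSOR_GROUP quantization_scheme with attached observers.

```python
import torch
from torch.nn import Module

from llmcompressor.modifiers.quantization.calibration import call_observer


def calibrate_nvfp4_layer(module: Module, sample_activation: torch.Tensor) -> None:
    """Hedged sketch: how the new flags map onto the NVFP4 cases above."""
    # Weights: compute only the global scale here; the per-group scale and
    # zero point are produced later by update_weight_zp_scale.
    call_observer(
        module,
        base_name="weight",
        should_calculate_gparam=True,
        should_calculate_qparams=False,
    )

    # Dynamic-local activations: only the global scale is calibrated offline;
    # the local scale/zero-point are computed on the fly at inference time.
    call_observer(
        module=module,
        base_name="input",
        value=sample_activation,
        should_calculate_gparam=True,
        should_calculate_qparams=False,
    )
```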

src/llmcompressor/modifiers/quantization/quantization/base.py

Lines changed: 12 additions & 1 deletion
@@ -2,8 +2,12 @@
 
 from llmcompressor.core import Event, EventType, State
 from llmcompressor.modifiers import Modifier
-from llmcompressor.modifiers.quantization.calibration import update_weight_zp_scale
+from llmcompressor.modifiers.quantization.calibration import (
+    update_weight_global_scale,
+    update_weight_zp_scale,
+)
 from llmcompressor.modifiers.quantization.quantization.mixin import QuantizationMixin
+from llmcompressor.modifiers.utils import update_fused_layer_weight_global_scales
 
 __all__ = ["QuantizationModifier"]
 
@@ -66,7 +70,14 @@ def on_start(self, state: State, event: Event, **kwargs):
         QuantizationMixin.start_calibration(self, state.model)
 
         modules = list(state.model.modules())
+        # TODO: this step can be combined with update_weight_zp_scale
+        # once update_fused_layer_weight_global_scales is removed
+        # and not required by vLLM
+        for module in tqdm.tqdm(modules):
+            update_weight_global_scale(module)
+
         for module in tqdm.tqdm(modules, desc="Calibrating weights"):
+            update_fused_layer_weight_global_scales(module)
             update_weight_zp_scale(module)
 
     def on_event(self, state: State, event: Event, **kwargs):

src/llmcompressor/modifiers/utils/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
 # flake8: noqa
 
 from .constants import *
+from .helpers import *

src/llmcompressor/modifiers/utils/helpers.py

Lines changed: 98 additions & 0 deletions
@@ -0,0 +1,98 @@
+from typing import List
+
+import torch
+from compressed_tensors.quantization import QuantizationStrategy
+from compressed_tensors.utils import align_module_device, update_parameter_data
+from torch.nn import Linear, Module
+
+__all__ = ["update_fused_layer_weight_global_scales"]
+
+
+def update_fused_layer_weight_global_scales(submodule: torch.nn.Module):
+    """
+    When running NVFP4 quantization, update the global scale
+    such that q,k,v layers are treated as one tensor with the same
+    global_scale and gate_proj/up_proj layers are treated as one tensor
+    with the same global scale. This is a requirement currently being set
+    by vLLM and may be removed in the future, or potentially made
+    an optional step.
+
+    :param submodule: submodule to update
+    """
+
+    def _is_attention_module(module: Module):
+        return "attention" in module.__class__.__name__.lower() and (
+            hasattr(module, "k_proj")
+            or hasattr(module, "v_proj")
+            or hasattr(module, "qkv_proj")
+        )
+
+    def _is_mlp_module(module: Module):
+        return "mlp" in module.__class__.__name__.lower() and (
+            hasattr(module, "gate_proj") or hasattr(module, "up_proj")
+        )
+
+    def _valid_tensor_group_quant(layer_list: List[Linear]):
+        """
+        Return True if all the linear layers in the layer_list are
+        TENSOR_GROUP quantized.
+        """
+        for layer in layer_list:
+            scheme = getattr(layer, "quantization_scheme", None)
+            if scheme is None:
+                return False
+
+            weight_quant_args = scheme.weights
+
+            if weight_quant_args is None:
+                return False
+
+            if weight_quant_args.strategy != QuantizationStrategy.TENSOR_GROUP:
+                return False
+        return True
+
+    with align_module_device(submodule):
+        if _is_attention_module(submodule):
+            # already fused/treated as one layer
+            if hasattr(submodule, "qkv_proj"):
+                return
+
+            if not _valid_tensor_group_quant(
+                [submodule.q_proj, submodule.v_proj, submodule.k_proj]
+            ):
+                return
+
+            global_scale = torch.min(
+                torch.cat(
+                    (
+                        submodule.q_proj.weight_global_scale.data,
+                        submodule.k_proj.weight_global_scale.data,
+                        submodule.v_proj.weight_global_scale.data,
+                    )
+                )
+            )
+
+            update_parameter_data(submodule.q_proj, global_scale, "weight_global_scale")
+            update_parameter_data(submodule.k_proj, global_scale, "weight_global_scale")
+            update_parameter_data(submodule.v_proj, global_scale, "weight_global_scale")
+
+    with align_module_device(submodule):
+        if _is_mlp_module(submodule):
+            if not _valid_tensor_group_quant([submodule.gate_proj, submodule.up_proj]):
+                return
+
+            global_scale = torch.min(
+                torch.cat(
+                    (
+                        submodule.gate_proj.weight_global_scale.data,
+                        submodule.up_proj.weight_global_scale.data,
+                    )
+                )
+            )
+
+            update_parameter_data(
+                submodule.gate_proj, global_scale, "weight_global_scale"
+            )
+            update_parameter_data(
+                submodule.up_proj, global_scale, "weight_global_scale"
+            )
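A toy numeric illustration (not from the commit) of the fusion above: the shared global scale is simply the minimum of the per-projection weight_global_scale values, which, since the global scale is inversely proportional to a tensor's absolute max (see the formula noted after the observers/helpers.py diff below), corresponds to the projection with the largest weight magnitude, keeping all fused projections in range.

```python
import torch

# Hypothetical per-projection global scales for q/k/v
q_scale = torch.tensor([448.0])
k_scale = torch.tensor([512.0])
v_scale = torch.tensor([384.0])

shared = torch.min(torch.cat((q_scale, k_scale, v_scale)))
print(shared)  # tensor(384.) -> written back to q_proj, k_proj, and v_proj
```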

src/llmcompressor/observers/helpers.py

Lines changed: 1 addition & 33 deletions
@@ -1,14 +1,8 @@
 from collections import Counter
-from typing import Optional
 
 import torch
-from compressed_tensors.quantization.quant_args import (
-    FP4_E2M1_DATA,
-    FP8_E4M3_DATA,
-    FloatArgs,
-)
 
-__all__ = ["get_observer_token_count", "generate_gparam"]
+__all__ = ["get_observer_token_count"]
 
 
 def get_observer_token_count(module: torch.nn.Module) -> Counter:
@@ -26,29 +20,3 @@ def get_observer_token_count(module: torch.nn.Module) -> Counter:
             module._num_observed_tokens
         )
     return token_counts
-
-
-# TODO: we have a similar function in ct already
-# consolidate when adding weight global scale
-# generation
-def generate_gparam(
-    updated_min_val: torch.Tensor,
-    updated_max_val: torch.Tensor,
-    scale_data: Optional[FloatArgs] = FP8_E4M3_DATA,
-    quant_data: Optional[FloatArgs] = FP4_E2M1_DATA,
-    dtype: Optional[torch.dtype] = torch.float32,
-):
-    """
-    Generate a global scale for an entire tensor (input_tensor).
-    Goal of the scale is to ensure that the quantization (local) scale
-    falls into the appropriate dtype range.
-
-    E.g. for NVFP4, group (local) scales are in dtype FP8. The global_scale
-    attempts to use the entire FP8 dtype range while mapping a per-group max
-    to the FP4 max.
-    """
-    min_vals = torch.min(updated_min_val, torch.zeros_like(updated_min_val))
-    max_vals = torch.max(updated_max_val, torch.zeros_like(updated_max_val))
-    max_val_pos = torch.max(torch.abs(min_vals), torch.abs(max_vals))
-    global_scale = scale_data.max * quant_data.max / max_val_pos
-    return global_scale.to(dtype)
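For context on what moved into compressed-tensors: the removed generate_gparam above computes the NVFP4 global scale from the observer's running min/max, and the compressed-tensors replacement referenced by neuralmagic/compressed-tensors#339 is assumed to follow the same formula:

$$\text{global\_scale} \;=\; \frac{\text{FP8\_E4M3}_{\max}\,\cdot\,\text{FP4\_E2M1}_{\max}}{\max\bigl(\lvert W \rvert\bigr)}$$

where the denominator is the tensor's absolute max derived from the observer's min/max values, so that after dividing by the global scale the per-group (local) FP8 scales can use the full FP8 range while each group's maximum maps onto the FP4 maximum.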

src/llmcompressor/observers/min_max.py

Lines changed: 1 addition & 2 deletions
@@ -2,11 +2,10 @@
 
 import torch
 from compressed_tensors.quantization.quant_args import QuantizationArgs
-from compressed_tensors.quantization.utils import calculate_qparams
+from compressed_tensors.quantization.utils import calculate_qparams, generate_gparam
 from compressed_tensors.utils import deprecated
 
 from llmcompressor.observers.base import Observer
-from llmcompressor.observers.helpers import generate_gparam
 
 __all__ = ["MinMaxObserver", "MovingAverageMinMaxObserver"]
 