
Commit d22a137

initial commit
1 parent 5742998 commit d22a137

4 files changed: +101 -28 lines changed

src/compressed_tensors/quantization/lifecycle/forward.py

Lines changed: 28 additions & 18 deletions
@@ -20,6 +20,7 @@
 from compressed_tensors.quantization.quant_args import (
     QuantizationArgs,
     QuantizationStrategy,
+    QuantizationType,
     round_to_quantized_type,
 )
 from compressed_tensors.quantization.quant_config import QuantizationStatus
@@ -359,18 +360,22 @@ def _quantize(
     dtype: Optional[torch.dtype] = None,
 ) -> torch.Tensor:

-    scaled = x / scale
-    if zero_point is not None:
-        scaled += zero_point.to(x.dtype)
-    # clamp first because cast isn't guaranteed to be saturated (ie for fp8)
-    clamped_value = torch.clamp(
-        scaled,
-        q_min,
-        q_max,
-    )
-    quantized_value = round_to_quantized_type(clamped_value, args)
-    if dtype is not None:
-        quantized_value = quantized_value.to(dtype)
+    if args.num_bits == 4 and args.type == QuantizationType.FLOAT:
+        # apply fp4 quant
+        return quantized_value
+    else:
+        scaled = x / scale
+        if zero_point is not None:
+            scaled += zero_point.to(x.dtype)
+        # clamp first because cast isn't guaranteed to be saturated (ie for fp8)
+        clamped_value = torch.clamp(
+            scaled,
+            q_min,
+            q_max,
+        )
+        quantized_value = round_to_quantized_type(clamped_value, args)
+        if dtype is not None:
+            quantized_value = quantized_value.to(dtype)

     return quantized_value
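The fp4 branch above is still a stub: `quantized_value` is returned before it is ever assigned, with only the `# apply fp4 quant` comment marking the intent. Purely as an illustration of what that step could look like, and not what this commit implements, quantization onto NVFP4's E2M1 value grid might be sketched as follows; the helper name `fake_quantize_fp4` and the explicit value table are assumptions:

import torch

# Representable E2M1 (NVFP4) magnitudes -- assumed here for illustration only
FP4_E2M1_VALUES = torch.tensor([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0])


def fake_quantize_fp4(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    # scale into the fp4 range and clamp to the largest representable magnitude
    scaled = torch.clamp(x / scale, -6.0, 6.0)
    grid = FP4_E2M1_VALUES.to(device=scaled.device, dtype=scaled.dtype)
    # snap each magnitude to the nearest grid point, then restore the sign
    idx = torch.argmin((scaled.abs().unsqueeze(-1) - grid).abs(), dim=-1)
    return torch.sign(scaled) * grid[idx]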

@@ -382,13 +387,18 @@ def _dequantize(
     zero_point: torch.Tensor = None,
     dtype: Optional[torch.dtype] = None,
 ) -> torch.Tensor:
-    dequant_value = x_q.to(scale.dtype)

-    if zero_point is not None:
-        dequant_value = dequant_value - zero_point.to(scale.dtype)
-    dequant_value = dequant_value * scale
+    if args.num_bits == 4 and args.type == QuantizationType.FLOAT:
+        # apply fp4 dequant
+        dequant_value = None
+    else:
+        dequant_value = x_q.to(scale.dtype)
+
+        if zero_point is not None:
+            dequant_value = dequant_value - zero_point.to(scale.dtype)
+        dequant_value = dequant_value * scale

-    if dtype is not None:
-        dequant_value = dequant_value.to(dtype)
+        if dtype is not None:
+            dequant_value = dequant_value.to(dtype)

     return dequant_value
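The dequantize branch is likewise a placeholder (`dequant_value = None`). If the quantized values are kept as real numbers on the E2M1 grid, as in the sketch above, the corresponding dequantize step would presumably just undo the scaling; again a hedged sketch, not the commit's implementation:

def fake_dequantize_fp4(
    x_q: torch.Tensor, scale: torch.Tensor, dtype: torch.dtype = None
) -> torch.Tensor:
    # x_q holds values on the +/-6 E2M1 grid; multiply the scale back in
    dequant_value = x_q.to(scale.dtype) * scale
    if dtype is not None:
        dequant_value = dequant_value.to(dtype)
    return dequant_value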

src/compressed_tensors/quantization/lifecycle/initialize.py

Lines changed: 29 additions & 1 deletion
@@ -30,6 +30,8 @@
 from compressed_tensors.quantization.quant_scheme import QuantizationScheme
 from compressed_tensors.quantization.utils import is_kv_cache_quant_scheme
 from compressed_tensors.utils import (
+    FP4_NVFP4_DATA,
+    FP8_E4M3_DATA,
     disable_hf_hook,
     has_offloaded_params,
     register_offload_parameter,
@@ -161,7 +163,33 @@ def _initialize_scale_zero_point(
         expected_shape = (weight_shape[0], max(num_groups, 1))

     scale_dtype = module.weight.dtype
-    if scale_dtype not in [torch.float16, torch.bfloat16, torch.float32]:
+
+    # NVFP4 support; use FP8 scales
+    # For weight quant, attach global scales for NVFP4
+    if (
+        base_name == "weight"
+        and quantization_args.num_bits == 4
+        and quantization_args.type == QuantizationType.FLOAT
+    ):
+        scale_dtype = FP8_E4M3_DATA.dtype
+        # create and attach nvfp4 data
+        tensor_amax = torch.abs(module.weight.data).max().to(torch.float32)
+        # Setting data for now - could possibly be handled later in the pipeline
+        values = FP8_E4M3_DATA.max * FP4_NVFP4_DATA.max / tensor_amax
+        # Assuming the global scale can be torch.float16/bfloat16/module weight dtype and not only torch.float32?
+        init_global_scale = Parameter(
+            values.to(dtype=torch.float32, device=device), requires_grad=False
+        )
+        register_offload_parameter(
+            module, f"{base_name}_global_scale", init_global_scale
+        )
+
+    if scale_dtype not in [
+        torch.float16,
+        torch.bfloat16,
+        torch.float32,
+        FP8_E4M3_DATA.dtype,
+    ]:
         scale_dtype = torch.float16

     # initializes empty scale, zero point, and g_idx parameters for the module
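The global scale attached here fits NVFP4's two-level scaling: per-group scales are stored in FP8 (E4M3), and a single FP32 global scale keeps them inside the FP8 range, sized so the tensor's absolute max maps onto FP8_max * FP4_max. A small numeric sketch of the expression above, with made-up weights:

import torch

fp8_max = torch.finfo(torch.float8_e4m3fn).max   # 448.0
fp4_max = 6.0                                    # largest E2M1 magnitude

weight = torch.randn(4096, 4096) * 0.02          # stand-in for module.weight.data
tensor_amax = weight.abs().max().to(torch.float32)

# same expression as `FP8_E4M3_DATA.max * FP4_NVFP4_DATA.max / tensor_amax`
global_scale = fp8_max * fp4_max / tensor_amax
# e.g. tensor_amax ~= 0.1  ->  global_scale ~= 448 * 6 / 0.1 = 26880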

src/compressed_tensors/quantization/quant_args.py

Lines changed: 26 additions & 0 deletions
@@ -13,6 +13,7 @@
 # limitations under the License.

 import warnings
+from dataclasses import dataclass
 from enum import Enum
 from typing import Any, Dict, Optional, Union

@@ -24,15 +25,38 @@

 __all__ = [
     "FP8_DTYPE",
+    "FP8_E4M3_DATA",
+    "FP4_NVFP4_DATA",
     "QuantizationType",
     "QuantizationStrategy",
     "QuantizationArgs",
     "round_to_quantized_type",
     "ActivationOrdering",
 ]

+# TODO: Remove soon in favour of a more descriptive FloatArgs
 FP8_DTYPE = torch.float8_e4m3fn

+
+@dataclass
+class FloatArgs:
+    exponent: int
+    mantissa: int
+    bits: int
+    max: float
+    min: float
+    dtype: Optional[torch.dtype] = None
+
+
+FP8_E4M3_DATA = FloatArgs(
+    exponent=4,
+    mantissa=3,
+    bits=8,
+    max=torch.finfo(torch.float8_e4m3fn).max,
+    min=torch.finfo(torch.float8_e4m3fn).min,
+    dtype=torch.float8_e4m3fn,
+)
+FP4_NVFP4_DATA = FloatArgs(exponent=2, mantissa=1, bits=4, max=6.0, min=-6.0)
+

 class QuantizationType(str, Enum):
     """
@@ -233,6 +257,8 @@ def validate_model_after(model: "QuantizationArgs") -> Dict[str, Any]:
         return model

     def pytorch_dtype(self) -> torch.dtype:
+        # TODO: required for the compressor
+        # Add FP4_nvfp4 type when updating naive_compressor
         if self.type == QuantizationType.FLOAT:
             return FP8_DTYPE
         elif self.type == QuantizationType.INT:
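With FloatArgs in place, format limits can be read from data rather than re-derived from torch.finfo at each call site (the compressor update the TODO above mentions would presumably do the same). A small usage sketch, assuming the definitions above are importable as shown:

from compressed_tensors.quantization.quant_args import FP4_NVFP4_DATA, FP8_E4M3_DATA

# FP8 E4M3 limits come straight from torch.finfo
print(FP8_E4M3_DATA.bits, FP8_E4M3_DATA.max, FP8_E4M3_DATA.min)       # 8 448.0 -448.0
# no torch dtype is assigned for NVFP4 (E2M1), so its limits are spelled out and dtype stays None
print(FP4_NVFP4_DATA.bits, FP4_NVFP4_DATA.max, FP4_NVFP4_DATA.dtype)  # 4 6.0 None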

src/compressed_tensors/quantization/utils/helpers.py

Lines changed: 18 additions & 9 deletions
@@ -17,7 +17,8 @@

 import torch
 from compressed_tensors.quantization.quant_args import (
-    FP8_DTYPE,
+    FP4_NVFP4_DATA,
+    FP8_E4M3_DATA,
     QuantizationArgs,
     QuantizationStrategy,
     QuantizationType,
@@ -73,6 +74,7 @@ def calculate_qparams(
     zp_dtype = quantization_args.pytorch_dtype()

     if quantization_args.symmetric:
+        # TODO: update for NVFP4 when applying observers
        max_val_pos = torch.max(torch.abs(min_vals), torch.abs(max_vals))
        scales = max_val_pos / (float(bit_range) / 2)
        scales = torch.clamp(scales, min=torch.finfo(torch.float32).eps)
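The TODO flags that the symmetric-scale formula above (max_val_pos / (bit_range / 2)) does not yet account for NVFP4's grouped, two-level scales. One plausible shape for that observer update, offered only as a hedged sketch; the group handling and the multiply-by-global-scale convention are assumptions, not this commit's code:

import torch


def nvfp4_group_scales(
    weight: torch.Tensor, group_size: int, global_scale: torch.Tensor
) -> torch.Tensor:
    # per-group absolute max over contiguous groups of the last dimension
    # (assumes the last dimension divides evenly by group_size)
    grouped = weight.reshape(weight.shape[0], -1, group_size)
    amax = grouped.abs().amax(dim=-1)
    # local scale maps the largest element of each group onto the E2M1 max (6.0)
    scales = amax / 6.0
    # store the local scales in FP8 E4M3, rescaled by the FP32 global scale
    return (scales * global_scale).to(torch.float8_e4m3fn)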
@@ -138,14 +140,21 @@ def calculate_range(quantization_args: QuantizationArgs, device: str) -> Tuple:
         q_max = torch.tensor(bit_range / 2 - 1, device=device)
         q_min = torch.tensor(-bit_range / 2, device=device)
     elif quantization_args.type == QuantizationType.FLOAT:
-        if quantization_args.num_bits != 8:
-            raise ValueError(
-                "Floating point quantization is only supported for 8 bits,"
-                f"got {quantization_args.num_bits}"
-            )
-        fp_range_info = torch.finfo(FP8_DTYPE)
-        q_max = torch.tensor(fp_range_info.max, device=device)
-        q_min = torch.tensor(fp_range_info.min, device=device)
+        if quantization_args.num_bits == 8:
+            """
+            if quantization_args.num_bits != 8:
+                raise ValueError(
+                    "Floating point quantization is only supported for 8 bits,"
+                    f"got {quantization_args.num_bits}"
+                )
+            """
+            q_max = torch.tensor(FP8_E4M3_DATA.max, device=device)
+            q_min = torch.tensor(FP8_E4M3_DATA.min, device=device)
+        else:
+            # nvfp4 ranges
+            assert quantization_args.num_bits == 4
+            q_max = torch.tensor(FP4_NVFP4_DATA.max, device=device)
+            q_min = torch.tensor(FP4_NVFP4_DATA.min, device=device)
     else:
         raise ValueError(f"Invalid quantization type {quantization_args.type}")
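With the new branch, calculate_range hands back the E4M3 limits for 8-bit float and the E2M1 limits for 4-bit float. A usage sketch; the (q_min, q_max) return order is assumed from the surrounding code:

from compressed_tensors.quantization.quant_args import QuantizationArgs
from compressed_tensors.quantization.utils import calculate_range

fp8_args = QuantizationArgs(num_bits=8, type="float")
fp4_args = QuantizationArgs(num_bits=4, type="float")

q_min, q_max = calculate_range(fp8_args, device="cpu")   # -448.0, 448.0
q_min, q_max = calculate_range(fp4_args, device="cpu")   # -6.0, 6.0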
