Commit d49830d
update NVFP4 data type; add scheme
1 parent 36204f0 · commit d49830d

File tree: 4 files changed, +22 -17 lines changed

src/compressed_tensors/quantization/lifecycle/initialize.py

Lines changed: 4 additions & 6 deletions

@@ -22,7 +22,7 @@
     wrap_module_forward_quantized,
 )
 from compressed_tensors.quantization.quant_args import (
-    FP4_NVFP4_DATA,
+    FP4_E2M1_DATA,
     FP8_E4M3_DATA,
     ActivationOrdering,
     QuantizationArgs,
@@ -167,19 +167,17 @@ def _initialize_scale_zero_point(
 
     # NVFP4 support; use FP8 scales
     # For weight quant, attach global scales for NVFP4
-    # TODO: How do we know if we need a global scale?
     # TODO: NVFP4 Scheme
     if (
-        base_name == "weight"
-        and quantization_args.num_bits == 4
+        quantization_args.num_bits == 4
         and quantization_args.type == QuantizationType.FLOAT
     ):
         scale_dtype = FP8_E4M3_DATA.dtype
         # create and attach nvfp4 data
         tensor_amax = torch.abs(module.weight.data).max().to(torch.float32)
         # Setting data for now - could possibly be handled later in the pipeline
-        value = FP8_E4M3_DATA.max * FP4_NVFP4_DATA.max / tensor_amax
-        # use the weight dtype (bfloat) maybe use float32 to start?
+        value = FP8_E4M3_DATA.max * FP4_E2M1_DATA.max / tensor_amax
+        # TODO: use model.weight.dtype
         value = value.to(torch.float32).to(device)
         # Assuming the global scale can be torch.float16/bfloat16/module weight dtype and not only torch.float32?
         init_global_scale = Parameter(value, requires_grad=False)
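For reference, the global-scale arithmetic in this hunk reduces to a short, self-contained sketch. The constants 448.0 (FP8_E4M3_DATA.max) and 6.0 (FP4_E2M1_DATA.max) come from the diff; the helper name compute_nvfp4_global_scale and the standalone constants are illustrative, not names from the library.

    import torch

    FP8_E4M3_MAX = 448.0  # torch.finfo(torch.float8_e4m3fn).max
    FP4_E2M1_MAX = 6.0    # largest representable E2M1 magnitude

    def compute_nvfp4_global_scale(weight: torch.Tensor) -> torch.Tensor:
        """Sketch of the per-tensor global scale attached in the hunk above."""
        tensor_amax = weight.abs().max().to(torch.float32)
        # Chosen so that the product of the per-group FP8 scale and this global
        # scale maps the largest weight magnitude onto the FP4 E2M1 range.
        return (FP8_E4M3_MAX * FP4_E2M1_MAX / tensor_amax).to(torch.float32)

    # hypothetical usage
    w = torch.randn(128, 256, dtype=torch.bfloat16)
    global_scale = compute_nvfp4_global_scale(w)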

src/compressed_tensors/quantization/quant_args.py

Lines changed: 2 additions & 3 deletions

@@ -26,7 +26,7 @@
 __all__ = [
     "FP8_DTYPE",
     "FP8_E4M3_DATA",
-    "FP4_NVFP4_DATA",
+    "FP4_E2M1_DATA",
     "QuantizationType",
     "QuantizationStrategy",
     "QuantizationArgs",
@@ -56,8 +56,7 @@ class FloatArgs:
     min=torch.finfo(torch.float8_e4m3fn).min,
     dtype=torch.float8_e4m3fn,
 )
-# Don't call NVFP4; should be based on exponent and mantissa
-FP4_NVFP4_DATA = FloatArgs(exponent=2, mantissa=1, bits=4, max=6.0, min=-6.0)
+FP4_E2M1_DATA = FloatArgs(exponent=2, mantissa=1, bits=4, max=6.0, min=-6.0)
 
 
 class QuantizationType(str, Enum):
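The rename reflects the encoding rather than the product name: E2M1 is a 4-bit float with 1 sign bit, 2 exponent bits, and 1 mantissa bit, so its largest magnitude is 6.0, matching max=6.0/min=-6.0 above. A minimal sketch enumerating the representable positive values, assuming the standard E2M1 interpretation with exponent bias 1 (not code from the repository):

    # Positive E2M1 values: sign(1) | exponent(2, bias 1) | mantissa(1)
    values = []
    for exp_bits in range(4):          # exponent field 0..3
        for man_bit in range(2):       # mantissa field 0 or 1
            if exp_bits == 0:          # subnormals: 0.0 and 0.5
                values.append(0.5 * man_bit)
            else:
                values.append((1 + 0.5 * man_bit) * 2 ** (exp_bits - 1))
    print(sorted(values))  # [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0]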

src/compressed_tensors/quantization/quant_scheme.py

Lines changed: 12 additions & 0 deletions

@@ -100,6 +100,17 @@ def is_preset_scheme(name: str) -> bool:
 
 UNQUANTIZED = dict()
 
+NVFP4 = dict(
+    weights=QuantizationArgs(
+        num_bits=4,
+        type=QuantizationType.FLOAT,
+        strategy=QuantizationStrategy.GROUP,
+        symmetric=True,
+        dynamic=False,
+        group_size=16,
+    )
+)
+
 # 8 bit integer weights and 8 bit activations quantization
 INT8_W8A8 = dict(
     weights=QuantizationArgs(
@@ -212,4 +223,5 @@ def is_preset_scheme(name: str) -> bool:
     # Float weight and activation schemes
     "FP8": FP8,
     "FP8_DYNAMIC": FP8_DYNAMIC,
+    "NVFP4": NVFP4,
 }
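With the preset registered, a config can refer to the scheme by name. A rough usage sketch, assuming the module-level NVFP4 dict and is_preset_scheme shown in this diff are importable as written (the exact preset-resolution helper is not shown in the diff, so the checks below read the preset dict directly):

    from compressed_tensors.quantization.quant_scheme import NVFP4, is_preset_scheme
    from compressed_tensors.quantization.quant_args import QuantizationStrategy

    assert is_preset_scheme("NVFP4")

    weight_args = NVFP4["weights"]
    # 4-bit float weights, grouped so that every 16 values share one scale
    assert weight_args.num_bits == 4
    assert weight_args.strategy == QuantizationStrategy.GROUP
    assert weight_args.group_size == 16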

src/compressed_tensors/quantization/utils/helpers.py

Lines changed: 4 additions & 8 deletions

@@ -17,7 +17,7 @@
 
 import torch
 from compressed_tensors.quantization.quant_args import (
-    FP4_NVFP4_DATA,
+    FP4_E2M1_DATA,
    FP8_E4M3_DATA,
     QuantizationArgs,
     QuantizationStrategy,
@@ -85,11 +85,7 @@ def calculate_qparams(
         and quantization_args.type == QuantizationType.FLOAT
     ):
         assert global_scale is not None
-        # TODO: how do we pass in the global scale?
-        # An observer is attached per module, we can conditionally pass in
-        # the global scale --> TODO: check for presence of the global when updating the scale
-        # TODO: maybe remove FP8 scale cast
-        scale = max_val_pos / FP4_NVFP4_DATA.max
+        scale = max_val_pos / FP4_E2M1_DATA.max  # Not needed
         scale = scale / global_scale
         scale = scale.to(FP8_E4M3_DATA.dtype)  # .to(torch.float32)
 
@@ -166,8 +162,8 @@ def calculate_range(quantization_args: QuantizationArgs, device: str) -> Tuple:
         else:
             # nvfp4 ranges
             assert quantization_args.num_bits == 4
-            q_max = torch.tensor(FP4_NVFP4_DATA.max, device=device)
-            q_min = torch.tensor(FP4_NVFP4_DATA.min, device=device)
+            q_max = torch.tensor(FP4_E2M1_DATA.max, device=device)
+            q_min = torch.tensor(FP4_E2M1_DATA.min, device=device)
     else:
         raise ValueError(f"Invalid quantization type {quantization_args.type}")
 
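The calculate_qparams hunk implements the two-level NVFP4 scaling: a per-group scale is derived from the group's observed maximum, divided by the tensor-wide global scale, and stored in FP8 E4M3. A standalone sketch of that arithmetic with hypothetical inputs (the real function also handles zero points and other strategies, omitted here; nvfp4_group_scale is an illustrative name, not a library function):

    import torch

    FP4_E2M1_MAX = 6.0
    FP8_E4M3_DTYPE = torch.float8_e4m3fn  # requires a PyTorch build with float8 support

    def nvfp4_group_scale(max_val_pos: torch.Tensor,
                          global_scale: torch.Tensor) -> torch.Tensor:
        """Per-group scale as in the calculate_qparams hunk above (sketch)."""
        scale = max_val_pos / FP4_E2M1_MAX   # map each group's max onto the E2M1 range
        scale = scale / global_scale         # fold out the tensor-wide global scale
        return scale.to(FP8_E4M3_DTYPE)      # per-group scales are stored in FP8 E4M3

    # hypothetical usage: one scale per group of 16 weights in a [128, 256] tensor
    max_val_pos = torch.rand(128, 16)                # stand-in for observed |w| group maxima
    global_scale = torch.tensor(448.0 * 6.0 / 4.2)   # per the initialize.py hunk, with amax = 4.2
    scales = nvfp4_group_scale(max_val_pos, global_scale)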
