Commit be02849

update
1 parent d22a137 commit be02849

File tree: 4 files changed (+42, −37 lines)


src/compressed_tensors/quantization/lifecycle/forward.py

Lines changed: 18 additions & 26 deletions
@@ -360,22 +360,18 @@ def _quantize(
     dtype: Optional[torch.dtype] = None,
 ) -> torch.Tensor:
 
-    if args.num_bits == 4 and args.type == QuantizationType.FLOAT:
-        # apply fp4 quant
-        return quantized_value
-    else:
-        scaled = x / scale
-        if zero_point is not None:
-            scaled += zero_point.to(x.dtype)
-        # clamp first because cast isn't guaranteed to be saturated (ie for fp8)
-        clamped_value = torch.clamp(
-            scaled,
-            q_min,
-            q_max,
-        )
-        quantized_value = round_to_quantized_type(clamped_value, args)
-        if dtype is not None:
-            quantized_value = quantized_value.to(dtype)
+    scaled = x / scale
+    if zero_point is not None:
+        scaled += zero_point.to(x.dtype)
+    # clamp first because cast isn't guaranteed to be saturated (ie for fp8)
+    clamped_value = torch.clamp(
+        scaled,
+        q_min,
+        q_max,
+    )
+    quantized_value = round_to_quantized_type(clamped_value, args)
+    if dtype is not None:
+        quantized_value = quantized_value.to(dtype)
 
     return quantized_value
 
@@ -388,17 +384,13 @@ def _dequantize(
     dtype: Optional[torch.dtype] = None,
 ) -> torch.Tensor:
 
-    if args.num_bits == 4 and args.type == QuantizationType.FLOAT:
-        # apply fp4 deqquant
-        dequant_value = None
-    else:
-        dequant_value = x_q.to(scale.dtype)
+    dequant_value = x_q.to(scale.dtype)
 
-        if zero_point is not None:
-            dequant_value = dequant_value - zero_point.to(scale.dtype)
-        dequant_value = dequant_value * scale
+    if zero_point is not None:
+        dequant_value = dequant_value - zero_point.to(scale.dtype)
+    dequant_value = dequant_value * scale
 
-        if dtype is not None:
-            dequant_value = dequant_value.to(dtype)
+    if dtype is not None:
+        dequant_value = dequant_value.to(dtype)
 
     return dequant_value
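
As a quick reference, here is a minimal standalone sketch of the symmetric round trip that the simplified _quantize/_dequantize now perform. The hand-picked scale, q_min, and q_max and the int8 round/cast are stand-ins for values that calculate_qparams, calculate_range, and round_to_quantized_type would normally supply:

import torch

# Toy symmetric int8 round trip mirroring the simplified helpers above.
# scale, q_min and q_max are hand-picked here; in the library they come from
# calculate_qparams and calculate_range.
x = torch.tensor([0.05, -1.30, 2.40])
scale = torch.tensor(2.40 / 127)               # map max |x| onto the int8 max
q_min, q_max = -128.0, 127.0

scaled = x / scale                              # no zero_point in the symmetric case
clamped = torch.clamp(scaled, q_min, q_max)     # clamp before the cast/round
x_q = torch.round(clamped).to(torch.int8)       # stand-in for round_to_quantized_type

x_dq = x_q.to(scale.dtype) * scale              # dequantize: cast back, then rescale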

src/compressed_tensors/quantization/lifecycle/initialize.py

Lines changed: 3 additions & 2 deletions
@@ -25,6 +25,7 @@
     ActivationOrdering,
     QuantizationArgs,
     QuantizationStrategy,
+    QuantizationType,
 )
 from compressed_tensors.quantization.quant_config import QuantizationStatus
 from compressed_tensors.quantization.quant_scheme import QuantizationScheme
@@ -169,7 +170,7 @@ def _initialize_scale_zero_point(
     if (
         base_name == "weight"
         and quantization_args.num_bits == 4
-        and quantization_args.strategy == QuantizationStrategy.FLOAT
+        and quantization_args.type == QuantizationType.FLOAT
     ):
         scale_dtype = FP8_E4M3_DATA.dtype
         # create and attach nvfp4 data
@@ -188,7 +189,7 @@ def _initialize_scale_zero_point(
         torch.float16,
         torch.bfloat16,
         torch.float32,
-        FP8_DATA.dtype,
+        FP8_E4M3_DATA.dtype,
     ]:
         scale_dtype = torch.float16
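
Note the fix in the weight check: strategy describes how scales are laid out (tensor, channel, group, ...), while type distinguishes integer from float formats, so the FP4 guard has to key off the type. A small sketch of the corrected predicate, assuming the enums are importable from quant_args as the diff suggests:

from compressed_tensors.quantization.quant_args import (
    QuantizationArgs,
    QuantizationType,
)

def is_fp4_weight(base_name: str, args: QuantizationArgs) -> bool:
    # 4-bit float weight quantization (NVFP4), independent of the scale strategy
    return (
        base_name == "weight"
        and args.num_bits == 4
        and args.type == QuantizationType.FLOAT
    )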

src/compressed_tensors/quantization/quant_args.py

Lines changed: 5 additions & 1 deletion
@@ -289,7 +289,11 @@ def round_to_quantized_type(
     """
     original_dtype = tensor.dtype
     if args.type == QuantizationType.FLOAT:
-        rounded = tensor.to(FP8_DTYPE)
+        if args.num_bits == 8:
+            rounded = tensor.to(FP8_E4M3_DATA.dtype)
+        elif args.num_bits == 4:
+            # TODO: cast to whatever value we want fp4 to be post quantization/clamping
+            rounded = tensor.to()
 elif args.type == QuantizationType.INT:
     rounded = torch.round(tensor)
 else:
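
For 8-bit floats the rounding is just a cast to FP8_E4M3_DATA.dtype, but torch has no native 4-bit float dtype, hence the TODO on the FP4 branch. One possible stand-in, purely a sketch (the helper name and the nearest-value tie handling are assumptions, not this library's method), is to snap values to the nearest representable E2M1 magnitude while keeping the original dtype:

import torch

# Positive magnitudes representable in FP4 E2M1 (the NVFP4 element format).
_FP4_E2M1_LEVELS = torch.tensor([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0])

def round_to_fp4_e2m1(tensor: torch.Tensor) -> torch.Tensor:
    # Hypothetical helper: snap each value to the nearest E2M1 level.
    levels = _FP4_E2M1_LEVELS.to(device=tensor.device, dtype=tensor.dtype)
    sign = torch.sign(tensor)
    idx = (tensor.abs().unsqueeze(-1) - levels).abs().argmin(dim=-1)
    return sign * levels[idx]

x = torch.tensor([0.2, -1.3, 5.1, 7.0])
round_to_fp4_e2m1(x)   # tensor([0.0000, -1.5000, 6.0000, 6.0000])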

src/compressed_tensors/quantization/utils/helpers.py

Lines changed: 16 additions & 8 deletions
@@ -76,7 +76,22 @@ def calculate_qparams(
     if quantization_args.symmetric:
         # TODO: update for NVFP4 when applying observers
         max_val_pos = torch.max(torch.abs(min_vals), torch.abs(max_vals))
-        scales = max_val_pos / (float(bit_range) / 2)
+
+        if (
+            quantization_args.num_bits == 4
+            and quantization_args.type == QuantizationType.FLOAT
+        ):
+            # TODO: how do we pass in the global scale?
+            # An observer is attached per module, we can conditionally pass in
+            # the global scale
+            scale = global_scale * (max_val_pos / FP4_NVFP4_DATA.max)
+            scale = scale.to(FP8_E4M3_DATA.dtype).to(torch.float32)
+            scale = scale / global_scale
+        else:
+            # Divide over bit range over max value?
+            scales = max_val_pos / (float(bit_range) / 2)
+
+        # needed for fp4?
         scales = torch.clamp(scales, min=torch.finfo(torch.float32).eps)
         zero_points = torch.zeros(scales.shape, device=device, dtype=min_vals.dtype)
     else:
@@ -141,13 +156,6 @@ def calculate_range(quantization_args: QuantizationArgs, device: str) -> Tuple:
         q_min = torch.tensor(-bit_range / 2, device=device)
     elif quantization_args.type == QuantizationType.FLOAT:
         if quantization_args.num_bits == 8:
-            """
-            if quantization_args.num_bits != 8:
-                raise ValueError(
-                    "Floating point quantization is only supported for 8 bits,"
-                    f"got {quantization_args.num_bits}"
-                )
-            """
             q_max = torch.tensor(FP8_E4M3_DATA.max, device=device)
             q_min = torch.tensor(FP8_E4M3_DATA.min, device=device)
         else:
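
The new branch in calculate_qparams is the two-level NVFP4 scheme: the per-group scale is computed against the FP4 maximum, lifted into FP8 range by a global scale, quantized to E4M3, then divided back out. A worked sketch with hand-picked numbers (6.0 as the E2M1 max and torch.float8_e4m3fn as the equivalent of FP8_E4M3_DATA.dtype are assumptions based on the names in the diff and need a torch build with float8 support; how global_scale gets passed in is still the open TODO above):

import torch

max_val_pos = torch.tensor(0.75)     # per-group max |w|
global_scale = torch.tensor(256.0)   # assumed per-tensor global scale

scale = global_scale * (max_val_pos / 6.0)                # 32.0, within FP8 range
scale = scale.to(torch.float8_e4m3fn).to(torch.float32)   # quantize the scale itself
scale = scale / global_scale                              # 0.125: the local FP4 scale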
