Commit 36204f0

update quant/dequant steps; update scale calculation step
1 parent 79437ef commit 36204f0

4 files changed (+57, -19 lines)

src/compressed_tensors/quantization/lifecycle/forward.py

Lines changed: 41 additions & 14 deletions
@@ -50,6 +50,7 @@ def quantize(
     args: QuantizationArgs,
     dtype: Optional[torch.dtype] = None,
     g_idx: Optional[torch.Tensor] = None,
+    global_scale: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
     """
     Quantize the input tensor x using the QuantizationStrategy specified in args.
@@ -76,6 +77,7 @@ def quantize(
         do_quantize=True,
         do_dequantize=False,
         g_idx=g_idx,
+        global_scale=global_scale,
     )

@@ -87,6 +89,7 @@ def dequantize(
     args: Optional[QuantizationArgs] = None,
     dtype: Optional[torch.dtype] = None,
     g_idx: Optional[torch.Tensor] = None,
+    global_scale: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
     """
     Dequantize a quantized input tensor x_q based on the strategy specified in args. If
@@ -129,6 +132,7 @@ def dequantize(
         do_dequantize=True,
         dtype=dtype,
         g_idx=g_idx,
+        global_scale=global_scale,
     )

@@ -139,6 +143,7 @@ def fake_quantize(
     zero_point: torch.Tensor,
     args: QuantizationArgs,
     g_idx: Optional[torch.Tensor] = None,
+    global_scale: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
     """
     Fake quantize the input tensor x by quantizing then dequantizing with
@@ -162,6 +167,7 @@ def fake_quantize(
         do_quantize=True,
         do_dequantize=True,
         g_idx=g_idx,
+        global_scale=global_scale,
     )

@@ -175,6 +181,7 @@ def _process_quantization(
     dtype: Optional[torch.dtype] = None,
     do_quantize: bool = True,
     do_dequantize: bool = True,
+    global_scale: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
     q_min, q_max = calculate_range(args, x.device)
     group_size = args.group_size
@@ -222,35 +229,44 @@ def _process_quantization(
             end = start + group_count
             if do_quantize:
                 output[:, start:end] = _quantize(
-                    x[:, start:end],
-                    sc,
-                    zp,
-                    q_min,
-                    q_max,
-                    args,
+                    x=x[:, start:end],
+                    scale=sc,
+                    zero_point=zp,
+                    q_min=q_min,
+                    q_max=q_max,
+                    args=args,
                     dtype=dtype,
+                    global_scale=global_scale,
                 )

             if do_dequantize:
                 input = output[:, start:end] if do_quantize else x[:, start:end]
-                output[:, start:end] = _dequantize(input, sc, zp)
+                output[:, start:end] = _dequantize(
+                    x=input, scale=sc, zero_point=zp, global_scale=global_scale
+                )

         if not is_column_order:
             output = safe_permute(output, torch.argsort(perm), dim=1)

     else:  # covers channel, token and tensor strategies
         if do_quantize:
             output = _quantize(
-                x,
-                scale,
-                zero_point,
-                q_min,
-                q_max,
-                args,
+                x=x,
+                scale=scale,
+                zero_point=zero_point,
+                q_min=q_min,
+                q_max=q_max,
+                args=args,
                 dtype=dtype,
+                global_scale=global_scale,
             )
         if do_dequantize:
-            output = _dequantize(output if do_quantize else x, scale, zero_point)
+            output = _dequantize(
+                output if do_quantize else x,
+                scale=scale,
+                zero_point=zero_point,
+                global_scale=global_scale,
+            )

     return output

@@ -331,6 +347,7 @@ def forward_quantize(
         return value

     g_idx = getattr(module, "weight_g_idx", None)
+    global_scale = getattr(module, f"{base_name}_global_scale", None)

     if args.dynamic:
         # dynamic quantization - determine the scale/zp on the fly
@@ -346,6 +363,7 @@ def forward_quantize(
         zero_point=zero_point,
         args=args,
         g_idx=g_idx,
+        global_scale=global_scale,
     )

@@ -358,11 +376,16 @@ def _quantize(
     q_max: torch.Tensor,
     args: QuantizationArgs,
     dtype: Optional[torch.dtype] = None,
+    global_scale: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:

+    if global_scale:
+        scale = scale.to(global_scale.dtype) * global_scale
+
     scaled = x / scale
     if zero_point is not None:
         scaled += zero_point.to(x.dtype)
+
     # clamp first because cast isn't guaranteed to be saturated (ie for fp8)
     clamped_value = torch.clamp(
         scaled,
@@ -382,8 +405,12 @@ def _dequantize(
     scale: torch.Tensor,
     zero_point: torch.Tensor = None,
     dtype: Optional[torch.dtype] = None,
+    global_scale: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:

+    if global_scale:
+        scale = scale.to(global_scale.dtype) * global_scale
+
     dequant_value = x_q.to(scale.dtype)

     if zero_point is not None:
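For orientation, a minimal standalone sketch (not part of the commit) of what the new global_scale argument does in _quantize/_dequantize above: the local scale is first rescaled by the global scale, and the same effective scale is then applied on both the quantize and the dequantize path. The helper name and sample values below are illustrative, and the sketch checks for None rather than relying on tensor truthiness.

import torch

def sketch_fake_quantize(x, scale, global_scale=None, q_min=-6.0, q_max=6.0):
    # fold the (scalar) global scale into the local/group scale, as in _quantize
    if global_scale is not None:
        scale = scale.to(global_scale.dtype) * global_scale
    x_q = torch.clamp(x / scale, q_min, q_max)  # quantize (rounding omitted)
    return x_q * scale                          # dequantize with the same scale

x = torch.randn(4, 16)
scale = torch.full((4, 1), 0.05)
global_scale = torch.tensor(2.0)
out = sketch_fake_quantize(x, scale, global_scale)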

src/compressed_tensors/quantization/lifecycle/initialize.py

Lines changed: 3 additions & 0 deletions
@@ -167,6 +167,8 @@ def _initialize_scale_zero_point(

     # NVFP4 support; use FP8 scales
     # For weight quant, attach global scales for NVFP4
+    # TODO: How do we know if we need a global scale?
+    # TODO: NVFP4 Scheme
     if (
         base_name == "weight"
         and quantization_args.num_bits == 4
@@ -177,6 +179,7 @@ def _initialize_scale_zero_point(
         tensor_amax = torch.abs(module.weight.data).max().to(torch.float32)
         # Setting data for now - could possibly be handled later in the pipeline
         value = FP8_E4M3_DATA.max * FP4_NVFP4_DATA.max / tensor_amax
+        # use the weight dtype (bfloat) maybe use float32 to start?
         value = value.to(torch.float32).to(device)
         # Assuming the global scale can be torch.float16/bfloat16/module weight dtype and not only torch.float32?
         init_global_scale = Parameter(value, requires_grad=False)
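Taken in isolation, the initialization above reduces to the arithmetic below: one per-tensor global scale derived from the weight's absolute maximum, sized so that the product of the global scale and an FP4 group scale can cover the weight range. A hedged sketch, assuming FP8 E4M3 (max 448) for the scale dtype and FP4 E2M1 (max 6.0) for the values; variable names are illustrative.

import torch

FP8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max  # 448.0
FP4_E2M1_MAX = 6.0                                    # largest E2M1 (FP4) magnitude

weight = torch.randn(128, 256, dtype=torch.bfloat16)
tensor_amax = weight.abs().max().to(torch.float32)

# per-tensor global scale, kept in float32 as in the diff above
global_scale = FP8_E4M3_MAX * FP4_E2M1_MAX / tensor_amax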

src/compressed_tensors/quantization/quant_args.py

Lines changed: 3 additions & 0 deletions
@@ -56,6 +56,7 @@ class FloatArgs:
     min=torch.finfo(torch.float8_e4m3fn).min,
     dtype=torch.float8_e4m3fn,
 )
+# Don't call NVFP4; should be based on exponent and mantissa
 FP4_NVFP4_DATA = FloatArgs(exponent=2, mantissa=1, bits=4, max=6.0, min=-6.0)

@@ -265,6 +266,7 @@ def pytorch_dtype(self) -> torch.dtype:
                 return FP8_E4M3_DATA.dtype
             else:
                 assert self.num_bits == 4
+                # TODO: Use the look-up?
                 # TODO: will return None for now until updated in FloatArgs
                 return FP4_NVFP4_DATA.dtype
         elif self.type == QuantizationType.INT:
@@ -299,6 +301,7 @@ def round_to_quantized_type(
             rounded = tensor.to(FP8_E4M3_DATA.dtype)
         else:
             assert args.num_bits == 4
+            # TODO: Use the FP4_NVFP4_DATA class to use a look-up table
            # TODO: cast to whatever value we want fp4 to be post quantization/clamping
             rounded = tensor.to(FP4_NVFP4_DATA.dtype)
     elif args.type == QuantizationType.INT:
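The two look-up TODOs above exist because torch has no native 4-bit float dtype. One possible shape for such a look-up (an assumption for illustration, not code from this commit) is rounding each element to the nearest representable E2M1 value, i.e. {0, ±0.5, ±1, ±1.5, ±2, ±3, ±4, ±6}:

import torch

# representable E2M1 (FP4) magnitudes, positive and negative (hypothetical helper)
_FP4_POS = torch.tensor([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0])
FP4_E2M1_VALUES = torch.cat([_FP4_POS, -_FP4_POS[1:]])

def round_to_fp4(tensor: torch.Tensor) -> torch.Tensor:
    # nearest-value look-up; the result keeps the input dtype
    values = FP4_E2M1_VALUES.to(tensor.dtype)
    diffs = (tensor.unsqueeze(-1) - values).abs()
    return values[diffs.argmin(dim=-1)]

print(round_to_fp4(torch.tensor([0.3, 2.4, -4.7, 7.0])))  # rounds to 0.5, 2.0, -4.0, 6.0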

src/compressed_tensors/quantization/utils/helpers.py

Lines changed: 10 additions & 5 deletions
@@ -55,7 +55,10 @@


 def calculate_qparams(
-    min_vals: Tensor, max_vals: Tensor, quantization_args: QuantizationArgs
+    min_vals: Tensor,
+    max_vals: Tensor,
+    quantization_args: QuantizationArgs,
+    global_scale: Optional[Tensor] = None,
 ) -> Tuple[FloatTensor, IntTensor]:
     """
     :param min_vals: tensor of min value(s) to calculate scale(s) and zero point(s)
@@ -81,17 +84,19 @@ def calculate_qparams(
             quantization_args.num_bits == 4
             and quantization_args.type == QuantizationType.FLOAT
         ):
+            assert global_scale is not None
             # TODO: how do we pass in the global scale?
             # An observer is attached per module, we can conditionally pass in
-            # the global scale
-            scale = global_scale * (max_val_pos / FP4_NVFP4_DATA.max)
-            scale = scale.to(FP8_E4M3_DATA.dtype).to(torch.float32)
+            # the global scale --> TODO: check for presence of the global when updating the scale
+            # TODO: maybe remove FP8 scale cast
+            scale = max_val_pos / FP4_NVFP4_DATA.max
             scale = scale / global_scale
+            scale = scale.to(FP8_E4M3_DATA.dtype)  # .to(torch.float32)
+
         else:
             # Divide over bit range over max value?
             scales = max_val_pos / (float(bit_range) / 2)

-        # needed for fp4?
         scales = torch.clamp(scales, min=torch.finfo(torch.float32).eps)
         zero_points = torch.zeros(scales.shape, device=device, dtype=min_vals.dtype)
     else:
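Putting the FP4 branch above together: the per-group scale is expressed relative to the global scale and then stored in FP8. A minimal sketch of that calculation, assuming a scalar float32 global_scale; the function name and values are illustrative, not part of the repo.

import torch

FP4_E2M1_MAX = 6.0
FP8_SCALE_DTYPE = torch.float8_e4m3fn

def sketch_fp4_scale(max_val_pos: torch.Tensor, global_scale: torch.Tensor) -> torch.Tensor:
    scale = max_val_pos / FP4_E2M1_MAX  # maps each group's max onto FP4's max value
    scale = scale / global_scale        # factor out the global scale (re-applied in _quantize)
    return scale.to(FP8_SCALE_DTYPE)    # FP8 storage, as in the diff above

max_val_pos = torch.tensor([0.12, 0.48, 0.03])
global_scale = torch.tensor(1.5)
print(sketch_fp4_scale(max_val_pos, global_scale))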
