
Commit a6327c7

[NVFP4] Enable Fp4 Quantization; introduce / apply global_scales (#315)
* add nvfp4 args
* format
* dont use a dataclass
* remove dataclass
* update forward pass
* update helpers
* update docstring
* add init functionality
* clean-up
* add docstring
* add import
* test skeletons
* fix args
* fix condition
* add tests
* update
* fix compress_weight
* add docstring
* fix
* use approx
* fix illegal device access
* update docstring; fix typos
* use helper
* update condition
* remove TODO
1 parent b8a443a commit a6327c7

10 files changed: +318 additions, -35 deletions

src/compressed_tensors/compressors/quantized_compressors/base.py

Lines changed: 2 additions & 0 deletions
@@ -99,6 +99,7 @@ def compress(
             scale = model_state.get(prefix + "weight_scale", None)
             g_idx = model_state.get(prefix + "weight_g_idx", None)
             zp = model_state.get(prefix + "weight_zero_point", None)
+            global_scale = model_state.get(prefix + "weight_global_scale", None)

             # is scale does not exist, then weight cannot be compressed
             if scale is None:
@@ -112,6 +113,7 @@ def compress(
                     weight=value,
                     scale=scale,
                     zero_point=zp,
+                    global_scale=global_scale,
                     g_idx=g_idx,
                     quantization_args=quant_args,
                     device="cpu",

src/compressed_tensors/compressors/quantized_compressors/naive_quantized.py

Lines changed: 6 additions & 0 deletions
@@ -78,6 +78,7 @@ def compress_weight(
         zero_point: Optional[Tensor] = None,
         g_idx: Optional[torch.Tensor] = None,
         device: Optional[torch.device] = None,
+        global_scale: Optional[torch.Tensor] = None,
     ) -> Dict[str, torch.Tensor]:
         """
         Compresses a single uncompressed weight
@@ -90,6 +91,11 @@ def compress_weight(
         :param device: optional device to move compressed output to
         :return: dictionary of compressed weight data
         """
+        if global_scale is not None:
+            raise ValueError(
+                "global_scale is not supported for the NaiveQuantizationCompressor"
+            )
+
         if can_quantize(weight, quantization_args):
             quantized_weight = quantize(
                 x=weight,

src/compressed_tensors/compressors/quantized_compressors/pack_quantized.py

Lines changed: 6 additions & 0 deletions
@@ -94,6 +94,7 @@ def compress_weight(
         zero_point: Optional[Tensor] = None,
         g_idx: Optional[torch.Tensor] = None,
         device: Optional[torch.device] = None,
+        global_scale: Optional[torch.Tensor] = None,
     ) -> Dict[str, torch.Tensor]:
         """
         Compresses a single uncompressed weight
@@ -106,6 +107,11 @@ def compress_weight(
         :param device: optional device to move compressed output to
         :return: dictionary of compressed weight data
         """
+        if global_scale is not None:
+            raise ValueError(
+                "global_scale is not supported for the PackQuantizationCompressor"
+            )
+
         compressed_dict = {}
         if can_quantize(weight, quantization_args):
             quantized_weight = quantize(

src/compressed_tensors/quantization/lifecycle/apply.py

Lines changed: 10 additions & 1 deletion
@@ -27,8 +27,14 @@
 )
 from compressed_tensors.quantization.lifecycle.initialize import (
     initialize_module_for_quantization,
+    update_fused_layer_weight_global_scales,
+)
+from compressed_tensors.quantization.quant_args import (
+    FP4_E2M1_DATA,
+    FP8_E4M3_DATA,
+    QuantizationArgs,
+    QuantizationType,
 )
-from compressed_tensors.quantization.quant_args import QuantizationArgs
 from compressed_tensors.quantization.quant_config import (
     QuantizationConfig,
     QuantizationStatus,
@@ -266,6 +272,9 @@ def apply_quantization_status(model: Module, status: QuantizationStatus):
         )
     )

+    if status == QuantizationStatus.INITIALIZED:
+        update_fused_layer_weight_global_scales(model)
+
     if current_status < status >= QuantizationStatus.COMPRESSED > current_status:
         model.apply(compress_quantized_weights)

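
The INITIALIZED hook above calls update_fused_layer_weight_global_scales, whose body is not part of this diff. As a rough sketch of the idea only: layers that get fused at runtime (for example q/k/v projections) should agree on a single weight global scale, and one conservative way to reconcile them is to take the minimum of the individual values. The helper name and the min-reduction below are assumptions for illustration, not the library's implementation.

from typing import List

import torch

# Hypothetical helper: pick one shared global scale for a set of fused layers by
# taking the most conservative (smallest) value. Illustration only.
def share_global_scale(global_scales: List[torch.Tensor]) -> torch.Tensor:
    return torch.stack([gs.reshape(()) for gs in global_scales]).min()

q_scale = torch.tensor(410.0)
k_scale = torch.tensor(380.0)
v_scale = torch.tensor(395.0)
print(share_global_scale([q_scale, k_scale, v_scale]))  # tensor(380.)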

src/compressed_tensors/quantization/lifecycle/forward.py

Lines changed: 50 additions & 14 deletions
@@ -20,6 +20,7 @@
 from compressed_tensors.quantization.quant_args import (
     QuantizationArgs,
     QuantizationStrategy,
+    QuantizationType,
     round_to_quantized_type,
 )
 from compressed_tensors.quantization.quant_config import QuantizationStatus
@@ -49,6 +50,7 @@ def quantize(
     args: QuantizationArgs,
     dtype: Optional[torch.dtype] = None,
     g_idx: Optional[torch.Tensor] = None,
+    global_scale: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
     """
     Quantize the input tensor x using the QuantizationStrategy specified in args.
@@ -63,6 +65,7 @@ def quantize(
     :param args: quantization args dictating how to quantize x
     :param dtype: optional dtype to cast the quantized output to
     :param g_idx: optional mapping from column index to group index
+    :param global_scale: optional constant to scale the quantization scale during QDQ
     :return: fake quantized tensor
     """

@@ -75,6 +78,7 @@ def quantize(
         do_quantize=True,
         do_dequantize=False,
         g_idx=g_idx,
+        global_scale=global_scale,
     )


@@ -86,6 +90,7 @@ def dequantize(
     args: Optional[QuantizationArgs] = None,
     dtype: Optional[torch.dtype] = None,
     g_idx: Optional[torch.Tensor] = None,
+    global_scale: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
     """
     Dequantize a quantized input tensor x_q based on the strategy specified in args. If
@@ -97,6 +102,7 @@ def dequantize(
     :param args: quantization args used to quantize x_q
     :param dtype: optional dtype to cast the dequantized output to
     :param g_idx: optional mapping from column index to group index
+    :param global_scale: optional constant to scale the quantization scale during QDQ
     :return: dequantized float tensor
     """
     if args is None:
@@ -128,6 +134,7 @@ def dequantize(
         do_dequantize=True,
         dtype=dtype,
         g_idx=g_idx,
+        global_scale=global_scale,
     )


@@ -138,6 +145,7 @@ def fake_quantize(
     zero_point: torch.Tensor,
     args: QuantizationArgs,
     g_idx: Optional[torch.Tensor] = None,
+    global_scale: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
     """
     Fake quantize the input tensor x by quantizing then dequantizing with
@@ -151,6 +159,7 @@ def fake_quantize(
     :param zero_point: zero point tensor
     :param args: quantization args dictating how to quantize x
     :param g_idx: optional mapping from column index to group index
+    :param global_scale: optional constant to scale the quantization scale during QDQ
     :return: fake quantized tensor
     """
     return _process_quantization(
@@ -161,6 +170,7 @@ def fake_quantize(
         do_quantize=True,
         do_dequantize=True,
         g_idx=g_idx,
+        global_scale=global_scale,
     )


@@ -174,6 +184,7 @@ def _process_quantization(
     dtype: Optional[torch.dtype] = None,
     do_quantize: bool = True,
     do_dequantize: bool = True,
+    global_scale: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
     q_min, q_max = calculate_range(args, x.device)
     group_size = args.group_size
@@ -221,35 +232,44 @@ def _process_quantization(
             end = start + group_count
             if do_quantize:
                 output[:, start:end] = _quantize(
-                    x[:, start:end],
-                    sc,
-                    zp,
-                    q_min,
-                    q_max,
-                    args,
+                    x=x[:, start:end],
+                    scale=sc,
+                    zero_point=zp,
+                    q_min=q_min,
+                    q_max=q_max,
+                    args=args,
                     dtype=dtype,
+                    global_scale=global_scale,
                 )

             if do_dequantize:
                 input = output[:, start:end] if do_quantize else x[:, start:end]
-                output[:, start:end] = _dequantize(input, sc, zp)
+                output[:, start:end] = _dequantize(
+                    x_q=input, scale=sc, zero_point=zp, global_scale=global_scale
+                )

         if not is_column_order:
             output = safe_permute(output, torch.argsort(perm), dim=1)

     else:  # covers channel, token and tensor strategies
         if do_quantize:
             output = _quantize(
-                x,
-                scale,
-                zero_point,
-                q_min,
-                q_max,
-                args,
+                x=x,
+                scale=scale,
+                zero_point=zero_point,
+                q_min=q_min,
+                q_max=q_max,
+                args=args,
                 dtype=dtype,
+                global_scale=global_scale,
             )
         if do_dequantize:
-            output = _dequantize(output if do_quantize else x, scale, zero_point)
+            output = _dequantize(
+                output if do_quantize else x,
+                scale=scale,
+                zero_point=zero_point,
+                global_scale=global_scale,
+            )

     return output

@@ -330,6 +350,7 @@ def forward_quantize(
         return value

     g_idx = getattr(module, "weight_g_idx", None)
+    global_scale = getattr(module, f"{base_name}_global_scale", None)

     if args.dynamic:
         # dynamic quantization - determine the scale/zp on the fly
@@ -345,6 +366,7 @@ def forward_quantize(
         zero_point=zero_point,
         args=args,
         g_idx=g_idx,
+        global_scale=global_scale,
     )


@@ -357,11 +379,18 @@ def _quantize(
     q_max: torch.Tensor,
     args: QuantizationArgs,
     dtype: Optional[torch.dtype] = None,
+    global_scale: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:

+    # if a global scale is optionally provided, use it
+    # to further scale the local `scale` parameter
+    if global_scale:
+        scale = scale.to(global_scale.dtype) / global_scale
+
     scaled = x / scale
     if zero_point is not None:
         scaled += zero_point.to(x.dtype)
+
     # clamp first because cast isn't guaranteed to be saturated (ie for fp8)
     clamped_value = torch.clamp(
         scaled,
@@ -381,7 +410,14 @@ def _dequantize(
     scale: torch.Tensor,
     zero_point: torch.Tensor = None,
     dtype: Optional[torch.dtype] = None,
+    global_scale: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
+
+    # if a global scale is optionally provided, use it
+    # to further scale the local `scale` parameter
+    if global_scale:
+        scale = scale.to(global_scale.dtype) / global_scale
+
     dequant_value = x_q.to(scale.dtype)

     if zero_point is not None:
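
To make the new _quantize/_dequantize behavior concrete, here is a self-contained sketch (a re-implementation for illustration, not the library functions): when a global_scale is provided, the stored local scale is first folded as scale / global_scale, and both halves of the quantize/dequantize round trip use that folded value. The uniform integer grid clamped to +/-6 is a simplification of the real FP4 E2M1 value set, and the example numbers are assumptions for the demo.

from typing import Optional

import torch

def qdq(x: torch.Tensor, scale: torch.Tensor,
        global_scale: Optional[torch.Tensor] = None) -> torch.Tensor:
    """Fake quantize/dequantize with an optional global scale (illustrative only)."""
    if global_scale is not None:
        # same folding as the diff: scale = scale.to(global_scale.dtype) / global_scale
        scale = scale.to(global_scale.dtype) / global_scale
    # integer grid clamped to the FP4 E2M1 max of +/-6 (a simplification;
    # real E2M1 has a non-uniform value grid)
    q = torch.clamp(torch.round(x / scale), -6, 6)
    return q * scale

x = torch.tensor([0.05, -0.12, 0.30])
global_scale = torch.tensor(448.0)               # e.g. chosen so local scales fit an FP8 range
scale = torch.tensor(0.30 / 6.0) * global_scale  # stored local scale, pre-multiplied by the global factor
print(qdq(x, scale, global_scale))               # ~tensor([0.0500, -0.1000, 0.3000])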
