
Commit 7a7abdf

fix perm math
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
1 parent 4b55733 commit 7a7abdf

23 files changed: +226 additions, -278 deletions

src/compressed_tensors/compressors/model_compressors/model_compressor.py

Lines changed: 16 additions & 18 deletions
@@ -50,6 +50,7 @@
     align_module_device,
     delete_offload_parameter,
     get_execution_device,
+    get_offloaded_device,
     get_safetensors_folder,
     has_offloaded_params,
     merge_names,
@@ -408,16 +409,17 @@ def compress_model(self, model: Module):
                 )
 
                 # remove any existing parameters
-                device = get_execution_device(module)
+                exec_device = get_execution_device(module)
+                offload_device = get_offloaded_device(module)
                 for name, _ in list(module.named_parameters()):
-                    delattr(module, name)
+                    delete_offload_parameter(module, name)
 
                 # replace with compressed parameters
                 for name, value in state_dict.items():
                     name = name.removeprefix(f"{prefix}.")
-                    value = value.to(device)
+                    value = value.to(exec_device)
                     param = torch.nn.Parameter(value, requires_grad=False)
-                    register_offload_parameter(module, name, param)
+                    register_offload_parameter(module, name, param, offload_device)
 
                 module.quantization_status = QuantizationStatus.COMPRESSED
 
@@ -460,30 +462,26 @@ def decompress_model(self, model: Module):
 
                 # quantization second
                 if prefix in module_to_scheme:
-                    generator = self.quantization_compressor.decompress_from_state_dict(
-                        state_dict,
-                        names_to_scheme=module_to_scheme,
+                    state_dict = (
+                        self.quantization_compressor.decompress_module_from_state_dict(
+                            prefix,
+                            state_dict,
+                            scheme=module_to_scheme[prefix],
+                        )
                     )
-                    # generates (mod_path, {param_name, param_val})
-                    # of compressed params and used params, but not unused params
-                    # some used params are removed by get_unexpected_file_keys
-                    state_dict = {
-                        merge_names(module_path, param_name): param_value
-                        for module_path, compressed_data in generator
-                        for param_name, param_value in compressed_data.items()
-                    }
 
                 # remove any existing parameters
-                device = get_execution_device(module)
+                exec_device = get_execution_device(module)
+                offload_device = get_offloaded_device(module)
                 for name, _ in list(module.named_parameters()):
                     delete_offload_parameter(module, name)
 
                 # replace with decompressed parameters
                 for name, value in state_dict.items():
                     name = name.removeprefix(f"{prefix}.")
-                    value = value.to(device)
+                    value = value.to(exec_device)
                     param = torch.nn.Parameter(value, requires_grad=False)
-                    register_offload_parameter(module, name, param)
+                    register_offload_parameter(module, name, param, offload_device)
 
                 module.quantization_status = QuantizationStatus.FROZEN
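
Note: the compress/decompress paths above now record both the execution device and the offload device before dropping parameters, so replacements are built on the execution device but stored back on the module's original offload device. A minimal sketch of that pattern, assuming a module managed through compressed_tensors' offloading utilities (the helper name replace_module_params and the placeholder values dict are illustrative, not part of this commit):

import torch
from compressed_tensors.utils import (
    delete_offload_parameter,
    get_execution_device,
    get_offloaded_device,
    register_offload_parameter,
)


def replace_module_params(module: torch.nn.Module, new_values: dict) -> None:
    # capture both devices up front: compute happens on exec_device,
    # persistent storage should end up back on the offload device
    exec_device = get_execution_device(module)
    offload_device = get_offloaded_device(module)

    # drop existing parameters through the offload-aware helper so any
    # offloaded copies are removed along with the module attribute
    for name, _ in list(module.named_parameters()):
        delete_offload_parameter(module, name)

    # re-register replacements, keeping the persistent copy on offload_device
    for name, value in new_values.items():
        param = torch.nn.Parameter(value.to(exec_device), requires_grad=False)
        register_offload_parameter(module, name, param, offload_device)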

src/compressed_tensors/compressors/quantized_compressors/base.py

Lines changed: 30 additions & 3 deletions
@@ -24,6 +24,7 @@
     get_nested_weight_mappings,
     merge_names,
 )
+from compressed_tensors.utils.safetensors_load import match_param_name
 from safetensors import safe_open
 from torch import Tensor
 from tqdm import tqdm
@@ -223,9 +224,7 @@ def decompress_from_state_dict(
             state_dict, self.compression_param_names
         )
         for module_path in weight_mappings.keys():
-            weight_data = {}
-            for param_name, param_value in weight_mappings[module_path].items():
-                weight_data[param_name] = param_value
+            weight_data = weight_mappings[module_path].copy()
 
             if "weight_scale" in weight_data:
                 quant_args = names_to_scheme[module_path].weights
@@ -234,3 +233,31 @@
                 )
                 weight_data["weight"] = decompressed
             yield module_path, weight_data
+
+    def decompress_module_from_state_dict(
+        self,
+        prefix: str,
+        state_dict: Dict[str, torch.Tensor],
+        scheme: QuantizationScheme,
+    ) -> Dict[str, torch.Tensor]:
+        """
+        Only used by in-memory decompression pathways to decompress the parameters of
+        one module
+
+        :param prefix: prefix of state_dict, typically the path to the module
+        :param state_dict: state dict containing module parameter values
+        :param scheme: quantization scheme of module to decompress
+        :return: state dict with weight decompressed if applicable
+        """
+        state_dict = {
+            key.removeprefix(f"{prefix}."): value for key, value in state_dict.items()
+        }
+
+        if "weight_scale" in state_dict:
+            state_dict["weight"] = self.decompress_weight(
+                compressed_data=state_dict, quantization_args=scheme.weights
+            )
+
+        state_dict = {f"{prefix}.{key}": value for key, value in state_dict.items()}
+
+        return state_dict
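
Note: the new decompress_module_from_state_dict keeps the caller's key layout: it strips the module prefix, decompresses the weight in place when a weight_scale entry is present, then re-attaches the prefix. A self-contained sketch of just that key handling; the tensor shapes and the placeholder "decompressed" result are illustrative, and the decompress_weight call itself is elided because it needs a concrete compressor and real packed data:

import torch

prefix = "model.layers.0.mlp.down_proj"
state_dict = {
    f"{prefix}.weight_packed": torch.zeros(4, 2, dtype=torch.int32),  # placeholder
    f"{prefix}.weight_scale": torch.ones(4, 1),                       # placeholder
}

# strip the module prefix so compression params can be looked up by local name
local = {k.removeprefix(f"{prefix}."): v for k, v in state_dict.items()}
assert "weight_scale" in local  # this is what triggers decompress_weight(...)

# ... local["weight"] = self.decompress_weight(compressed_data=local, ...) ...
local["weight"] = torch.zeros(4, 4)  # placeholder for the decompressed weight

# re-attach the prefix before handing the dict back to the caller
restored = {f"{prefix}.{key}": value for key, value in local.items()}
assert f"{prefix}.weight" in restored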

src/compressed_tensors/quantization/lifecycle/apply.py

Lines changed: 1 addition & 10 deletions
@@ -27,14 +27,8 @@
 )
 from compressed_tensors.quantization.lifecycle.initialize import (
     initialize_module_for_quantization,
-    update_fused_layer_weight_global_scales,
-)
-from compressed_tensors.quantization.quant_args import (
-    FP4_E2M1_DATA,
-    FP8_E4M3_DATA,
-    QuantizationArgs,
-    QuantizationType,
 )
+from compressed_tensors.quantization.quant_args import QuantizationArgs
 from compressed_tensors.quantization.quant_config import (
     QuantizationConfig,
     QuantizationStatus,
@@ -272,9 +266,6 @@ def apply_quantization_status(model: Module, status: QuantizationStatus):
         )
     )
 
-    if status == QuantizationStatus.INITIALIZED:
-        update_fused_layer_weight_global_scales(model)
-
     if current_status < status >= QuantizationStatus.COMPRESSED > current_status:
         model.apply(compress_quantized_weights)

src/compressed_tensors/quantization/lifecycle/forward.py

Lines changed: 37 additions & 27 deletions
@@ -21,7 +21,6 @@
     DynamicType,
     QuantizationArgs,
     QuantizationStrategy,
-    QuantizationType,
     round_to_quantized_type,
 )
 from compressed_tensors.quantization.quant_config import QuantizationStatus
@@ -227,31 +226,42 @@ def _process_quantization(
             perm = torch.argsort(g_idx)
             x = safe_permute(x, perm, dim=1)
 
-        # TODO: experiment with vectorizing for loop for performance
-        end = 0
-        for index, group_count in enumerate(group_sizes):
-            sc = scale[:, index].view(-1, 1)
-            zp = zero_point[:, index].view(-1, 1) if zero_point is not None else None
-
-            start = end
-            end = start + group_count
-            if do_quantize:
-                output[:, start:end] = _quantize(
-                    x=x[:, start:end],
-                    scale=sc,
-                    zero_point=zp,
-                    q_min=q_min,
-                    q_max=q_max,
-                    args=args,
-                    dtype=dtype,
-                    global_scale=global_scale,
-                )
+        x = torch.reshape(
+            x,
+            (
+                x.shape[0],
+                ceil(x.shape[1] / group_size),
+                group_size,
+            ),
+        )
 
-            if do_dequantize:
-                input = output[:, start:end] if do_quantize else x[:, start:end]
-                output[:, start:end] = _dequantize(
-                    x_q=input, scale=sc, zero_point=zp, global_scale=global_scale
-                )
+        if do_quantize:
+            output = _quantize(
+                x=x,
+                scale=scale.unsqueeze(-1),
+                zero_point=zero_point.unsqueeze(-1) if zero_point is not None else None,
+                dtype=dtype,
+                global_scale=global_scale,
+                q_min=q_min,
+                q_max=q_max,
+                args=args,
+            )
+
+        if do_dequantize:
+            input = output if do_quantize else x
+            output = _dequantize(
+                x_q=input,
+                scale=scale.unsqueeze(-1),
+                zero_point=zero_point.unsqueeze(-1) if zero_point is not None else None,
+                global_scale=global_scale,
+            )
+
+        output = torch.reshape(
+            output,
+            (output.shape[0], output.shape[1] * output.shape[2]),
+        )
+
+        output = output.to(output_dtype)
 
         if not is_column_order:
            output = safe_permute(output, torch.argsort(perm), dim=1)
@@ -394,7 +404,7 @@ def _quantize(
 
     # if a global scale is optionally provided, use it
     # to further scale the local `scale` parameter
-    if global_scale:
+    if global_scale is not None:
         scale = scale.to(global_scale.dtype) / global_scale
 
     scaled = x / scale
@@ -427,7 +437,7 @@ def _dequantize(
 
     # if a global scale is optionally provided, use it
     # to further scale the local `scale` parameter
-    if global_scale:
+    if global_scale is not None:
         scale = scale.to(global_scale.dtype) / global_scale
 
     dequant_value = x_q.to(scale.dtype)
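
Note: the vectorized path above replaces the per-group Python loop with a single reshape to (rows, num_groups, group_size), so that per-group scales broadcast over the last axis via unsqueeze(-1) in one _quantize/_dequantize call. A minimal numeric sketch of the same broadcasting idea, using plain torch ops and a symmetric int8 range rather than this module's helpers (both choices are illustrative assumptions):

from math import ceil

import torch

# illustrative shapes: 2 rows, 8 columns, groups of 4 -> 2 groups per row
x = torch.randn(2, 8)
group_size = 4
num_groups = ceil(x.shape[1] / group_size)

# view columns as (rows, num_groups, group_size); scales are per (row, group)
x_grouped = x.reshape(x.shape[0], num_groups, group_size)
scale = x_grouped.abs().amax(dim=-1).clamp(min=1e-8) / 127.0  # (rows, num_groups)

# unsqueeze(-1) broadcasts each group's scale across its group_size columns
q = torch.clamp(torch.round(x_grouped / scale.unsqueeze(-1)), -128, 127)
dq = q * scale.unsqueeze(-1)

# flatten the group axis back to the original column layout
dq = dq.reshape(dq.shape[0], num_groups * group_size)
assert dq.shape == x.shape

The switch from "if global_scale:" to "if global_scale is not None:" in _quantize/_dequantize makes the check a presence test instead of evaluating the tensor's truth value.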

src/compressed_tensors/quantization/lifecycle/initialize.py

Lines changed: 7 additions & 113 deletions
@@ -23,26 +23,18 @@
     wrap_module_forward_quantized,
 )
 from compressed_tensors.quantization.quant_args import (
-    FP4_E2M1_DATA,
     FP8_E4M3_DATA,
     ActivationOrdering,
     QuantizationArgs,
     QuantizationStrategy,
-    QuantizationType,
 )
 from compressed_tensors.quantization.quant_config import QuantizationStatus
 from compressed_tensors.quantization.quant_scheme import QuantizationScheme
-from compressed_tensors.quantization.utils import (
-    generate_global_scale,
-    is_fp4,
-    is_kv_cache_quant_scheme,
-    iter_named_quantizable_modules,
-)
+from compressed_tensors.quantization.utils import is_fp4, is_kv_cache_quant_scheme
 from compressed_tensors.utils import (
     disable_hf_hook,
     get_execution_device,
     register_offload_parameter,
-    update_parameter_data,
 )
 from torch.nn import Module, Parameter
 
@@ -51,7 +43,6 @@
     "initialize_module_for_quantization",
     "is_attention_module",
     "KVCacheScaleType",
-    "update_fused_layer_weight_global_scales",
 ]
 
 
@@ -162,22 +153,13 @@ def _initialize_scale_zero_point(
     # initialize on execution device to avoid performing quantized ops on cpu
     device = get_execution_device(module)
 
-    # 1. Create global_scales for tensor_group
+    # 1. Create global_scales for tensor_group - generates
+    # a per tensor scale
     if quantization_args.strategy == QuantizationStrategy.TENSOR_GROUP:
-        # TODO: should move to llmcompressor
-        if base_name == "weight":
-            # When applying weight-only FP4 quantization, generate a global_scale
-            # This scale is applied during runtime to ensure that the generated
-            # local scale falls properly within the FP8 range (i.e max value is FP8_max)
-            # which is the expected dtype of NVFP4A16 scales
-            value = generate_global_scale(input_tensor=module.weight)
-            value = value.to(device)
-            init_global_scale = Parameter(value, requires_grad=False)
-        else:
-            init_global_scale = Parameter(
-                torch.empty(1, dtype=torch.float32, device=device),
-                requires_grad=False,
-            )
+        init_global_scale = Parameter(
+            torch.empty(1, dtype=torch.float32, device=device),
+            requires_grad=False,
+        )
         register_offload_parameter(
             module, f"{base_name}_global_scale", init_global_scale
         )
@@ -258,91 +240,3 @@ def _initialize_attn_scales(module: Module) -> None:
         requires_grad=False,
     )
     register_offload_parameter(module, KVCacheScaleType.VALUE.value, init_scale)
-
-
-# TODO: Potentially introduce an argument to turn this off
-# Only relevant for NVFP4A16 currently
-def update_fused_layer_weight_global_scales(model: torch.nn.Module):
-    """
-    When running NVFP4A16 quantization, update the global scale
-    such that q,k,v layers are treated as one tensor with the same
-    global_scale and gate_proj/up_proj layers are treated as one tensor
-    with the same global scale. This is requirement currently being set
-    by vLLM and may be removed in the future OR potentially make it
-    an optional step.
-
-    :param model: model to quantize
-    """
-
-    def _is_attention_module(module: Module):
-        return "attention" in module.__class__.__name__.lower() and (
-            hasattr(module, "k_proj")
-            or hasattr(module, "v_proj")
-            or hasattr(module, "qkv_proj")
-        )
-
-    def _is_mlp_module(module: Module):
-        return "mlp" in module.__class__.__name__.lower() and (
-            hasattr(module, "gate_proj") or hasattr(module, "up_proj")
-        )
-
-    def _valid_fp4_quant(layer_list: List[torch.nn.Linear]):
-        """
-        Return True if all the linear layers in the layer_list are
-        NVFP4A16 quantized.
-        """
-        for layer in layer_list:
-            scheme = getattr(layer, "quantization_scheme", None)
-            if scheme is None:
-                return False
-
-            weight_quant_args = scheme.weights
-
-            if weight_quant_args is None:
-                return False
-
-            if not is_fp4(quantization_args=weight_quant_args):
-                return False
-        return True
-
-    for name, submodule in iter_named_quantizable_modules(
-        model,
-        include_attn=True,
-        include_mlp=True,
-    ):
-
-        if _is_attention_module(submodule):
-            # already fused/treated as one layer
-            if hasattr(submodule, "qkv_proj"):
-                continue
-
-            if not _valid_fp4_quant(
-                [submodule.q_proj, submodule.v_proj, submodule.k_proj]
-            ):
-                continue
-
-            q_weight = submodule.q_proj.weight.data
-            v_weight = submodule.v_proj.weight.data
-            k_weight = submodule.k_proj.weight.data
-
-            value = generate_global_scale(
-                input_tensor=torch.cat((q_weight, v_weight, k_weight), dim=0)
-            )
-
-            update_parameter_data(submodule.q_proj, value, "weight_global_scale")
-            update_parameter_data(submodule.k_proj, value, "weight_global_scale")
-            update_parameter_data(submodule.v_proj, value, "weight_global_scale")
-
-        if _is_mlp_module(submodule):
-            if not _valid_fp4_quant([submodule.gate_proj, submodule.up_proj]):
-                continue
-
-            gate_data = submodule.gate_proj.weight.data
-            up_data = submodule.up_proj.weight.data
-
-            value = generate_global_scale(
-                input_tensor=torch.cat((gate_data, up_data), dim=0)
-            )
-
-            update_parameter_data(submodule.gate_proj, value, "weight_global_scale")
-            update_parameter_data(submodule.up_proj, value, "weight_global_scale")
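
Note: with the fused-layer global-scale logic removed from initialization, the TENSOR_GROUP branch now always registers a single empty fp32 global scale per parameter, presumably to be filled in later (e.g. during calibration) rather than computed from the weight at init time. A small stand-in for that branch; plain register_parameter is used here so the sketch stays self-contained, whereas the real code goes through register_offload_parameter:

import torch
from torch.nn import Parameter

module = torch.nn.Linear(8, 8)
device = module.weight.device
base_name = "weight"

# global scale starts as an uninitialized single fp32 value
init_global_scale = Parameter(
    torch.empty(1, dtype=torch.float32, device=device), requires_grad=False
)
module.register_parameter(f"{base_name}_global_scale", init_global_scale)

assert module.weight_global_scale.shape == (1,)
assert module.weight_global_scale.dtype == torch.float32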
