Commit e7c6ef4

[NVFP4] Fix onloading of fused layers (#1512)
Summary:
- Properly onload qkv and gate/up layers when updating global scales with CPU offloading

Testing:
- Tested in the memory-constrained case to ensure proper behaviour
1 parent 559ad81 · commit e7c6ef4

File tree

1 file changed (+18, −16)

src/llmcompressor/modifiers/utils/helpers.py

Lines changed: 18 additions & 16 deletions
@@ -2,7 +2,7 @@
 
 import torch
 from compressed_tensors.quantization import QuantizationStrategy
-from compressed_tensors.utils import align_module_device, update_parameter_data
+from compressed_tensors.utils import align_modules, update_parameter_data
 from torch.nn import Linear, Module
 
 __all__ = ["update_fused_layer_weight_global_scales"]
@@ -51,17 +51,17 @@ def _valid_tensor_group_quant(layer_list: List[Linear]):
                 return False
         return True
 
-    with align_module_device(submodule):
-        if _is_attention_module(submodule):
-            # already fused/treated as one layer
-            if hasattr(submodule, "qkv_proj"):
-                return
+    if _is_attention_module(submodule):
+        # already fused/treated as one layer
+        if hasattr(submodule, "qkv_proj"):
+            return
 
-            if not _valid_tensor_group_quant(
-                [submodule.q_proj, submodule.v_proj, submodule.k_proj]
-            ):
-                return
+        if not _valid_tensor_group_quant(
+            [submodule.q_proj, submodule.v_proj, submodule.k_proj]
+        ):
+            return
 
+        with align_modules([submodule.q_proj, submodule.v_proj, submodule.k_proj]):
             global_scale = torch.min(
                 torch.cat(
                     (
@@ -70,29 +70,31 @@ def _valid_tensor_group_quant(layer_list: List[Linear]):
                         submodule.v_proj.weight_global_scale.data,
                     )
                 )
-            )
+            ).reshape([1])
 
             update_parameter_data(submodule.q_proj, global_scale, "weight_global_scale")
             update_parameter_data(submodule.k_proj, global_scale, "weight_global_scale")
             update_parameter_data(submodule.v_proj, global_scale, "weight_global_scale")
+            del global_scale
 
-    with align_module_device(submodule):
-        if _is_mlp_module(submodule):
-            if not _valid_tensor_group_quant([submodule.gate_proj, submodule.up_proj]):
-                return
+    if _is_mlp_module(submodule):
+        if not _valid_tensor_group_quant([submodule.gate_proj, submodule.up_proj]):
+            return
 
+        with align_modules([submodule.gate_proj, submodule.up_proj]):
            global_scale = torch.min(
                torch.cat(
                    (
                        submodule.gate_proj.weight_global_scale.data,
                        submodule.up_proj.weight_global_scale.data,
                    )
                )
-            )
+            ).reshape([1])
 
             update_parameter_data(
                 submodule.gate_proj, global_scale, "weight_global_scale"
             )
             update_parameter_data(
                 submodule.up_proj, global_scale, "weight_global_scale"
             )
+            del global_scale
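
For context, below is a minimal sketch (not part of the commit) of the gate/up half of the new pattern, applied to a toy module. ToyMLP and its hand-registered weight_global_scale parameters are illustrative stand-ins for what llm-compressor's NVFP4 calibration flow attaches to real layers; align_modules and update_parameter_data are the same compressed-tensors utilities used in the diff above.

import torch
from torch.nn import Linear, Module

from compressed_tensors.utils import align_modules, update_parameter_data


class ToyMLP(Module):
    # Mirrors the gate_proj/up_proj layout that _is_mlp_module looks for.
    def __init__(self, hidden: int = 8, intermediate: int = 16):
        super().__init__()
        self.gate_proj = Linear(hidden, intermediate, bias=False)
        self.up_proj = Linear(hidden, intermediate, bias=False)
        # Hypothetical stand-in scales, registered by hand purely for this
        # example; in llm-compressor they are created during NVFP4 calibration.
        self.gate_proj.register_parameter(
            "weight_global_scale", torch.nn.Parameter(torch.tensor([3.0]))
        )
        self.up_proj.register_parameter(
            "weight_global_scale", torch.nn.Parameter(torch.tensor([2.0]))
        )


mlp = ToyMLP()

# Align the child Linear layers themselves. Per the commit summary, aligning
# only the parent module (the old align_module_device(submodule) approach)
# did not properly onload these fused projections under CPU offloading.
with align_modules([mlp.gate_proj, mlp.up_proj]):
    global_scale = torch.min(
        torch.cat(
            (
                mlp.gate_proj.weight_global_scale.data,
                mlp.up_proj.weight_global_scale.data,
            )
        )
    ).reshape([1])  # keep a 1-element tensor rather than a 0-dim scalar
    update_parameter_data(mlp.gate_proj, global_scale, "weight_global_scale")
    update_parameter_data(mlp.up_proj, global_scale, "weight_global_scale")

# Both fused layers now share the minimum scale.
assert mlp.gate_proj.weight_global_scale.item() == 2.0
assert mlp.up_proj.weight_global_scale.item() == 2.0

The sketch omits offloading itself, so it runs on a plain CPU model; the del global_scale lines in the diff appear to drop the last reference to the onloaded tensor before the alignment context exits.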
