Commit 1c4f639

[NVFP4] Fix global scale update when dealing with offloaded layers (#1554)
SUMMARY:
- Updating the global scale inside the `align_module` context does not persist the scale parameter
- Update outside of the context so that the offloaded dict is updated as well

TESTING:
- Resolves the CPU offloading issues seen with a Llama 70B FP4 model
1 parent fa0f793 commit 1c4f639

File tree

1 file changed: +9 −11 lines changed

src/llmcompressor/modifiers/utils/helpers.py

Lines changed: 9 additions & 11 deletions
@@ -72,10 +72,11 @@ def _valid_tensor_group_quant(layer_list: List[Linear]):
                 )
             ).reshape([1])
 
-            update_parameter_data(submodule.q_proj, global_scale, "weight_global_scale")
-            update_parameter_data(submodule.k_proj, global_scale, "weight_global_scale")
-            update_parameter_data(submodule.v_proj, global_scale, "weight_global_scale")
-            del global_scale
+        update_parameter_data(submodule.k_proj, global_scale, "weight_global_scale")
+        update_parameter_data(submodule.q_proj, global_scale, "weight_global_scale")
+        update_parameter_data(submodule.v_proj, global_scale, "weight_global_scale")
+
+        del global_scale
 
         if _is_mlp_module(submodule):
             if not _valid_tensor_group_quant([submodule.gate_proj, submodule.up_proj]):
@@ -91,10 +92,7 @@ def _valid_tensor_group_quant(layer_list: List[Linear]):
                 )
             ).reshape([1])
 
-            update_parameter_data(
-                submodule.gate_proj, global_scale, "weight_global_scale"
-            )
-            update_parameter_data(
-                submodule.up_proj, global_scale, "weight_global_scale"
-            )
-            del global_scale
+        update_parameter_data(submodule.gate_proj, global_scale, "weight_global_scale")
+        update_parameter_data(submodule.up_proj, global_scale, "weight_global_scale")
+
+        del global_scale
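
For context, here is a minimal sketch of the offloading pitfall this commit works around, assuming the `align_module_device` and `update_parameter_data` helpers from compressed-tensors that appear in the diff. The function name `refresh_global_scale` and the max-based scale computation are illustrative placeholders, not the actual NVFP4 logic:

```python
import torch
from compressed_tensors.utils import align_module_device, update_parameter_data

def refresh_global_scale(attn_module: torch.nn.Module) -> None:
    # Compute the shared scale while the (possibly offloaded) weights are
    # temporarily materialized on the execution device.
    with align_module_device(attn_module.q_proj):
        # Placeholder computation; the real code derives an NVFP4 global scale.
        global_scale = attn_module.q_proj.weight.abs().max().reshape([1])

    # Persist the scale OUTSIDE the alignment context: writes made inside it
    # only touch the temporary on-device copy, so an offloaded module's
    # weights dict would silently keep the stale value.
    update_parameter_data(attn_module.q_proj, global_scale, "weight_global_scale")
```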
