
Commit c48823f

Merge branch 'kylesayrs/sequential-onloading' into kylesayrs/deactivate-sequential_targets
2 parents: ab9dee0 + 6edc523

File tree: 3 files changed (+12, -13 lines)


examples/quantization_w8a8_int8/gemma2_example.py

Lines changed: 2 additions & 1 deletion
@@ -67,10 +67,11 @@ def tokenize(sample):
 # NOTE: transformers 4.49.0 results in a generation error with gemma2.
 # Consider either downgrading your transformers version to a previous version
 # or use vLLM for sample generation.
+# Note: compile is disabled: https://github.com/huggingface/transformers/issues/38333
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
 input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
-output = model.generate(input_ids, max_new_tokens=20)
+output = model.generate(input_ids, max_new_tokens=20, disable_compile=True)
 print(tokenizer.decode(output[0]))
 print("==========================================")
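
For context, a minimal sketch of the workaround this hunk applies. It assumes a transformers release recent enough to accept the disable_compile generation flag (which the hunk itself uses) and a CUDA device; the checkpoint name and dtype below are illustrative, not taken from this commit.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "google/gemma-2-2b-it"  # illustrative Gemma-2 checkpoint
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="cuda"
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
# disable_compile=True sidesteps the torch.compile generation bug tracked in
# https://github.com/huggingface/transformers/issues/38333
output = model.generate(input_ids, max_new_tokens=20, disable_compile=True)
print(tokenizer.decode(output[0]))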

src/llmcompressor/modifiers/utils/helpers.py

Lines changed: 9 additions & 11 deletions
@@ -72,10 +72,11 @@ def _valid_tensor_group_quant(layer_list: List[Linear]):
             )
         ).reshape([1])

-        update_parameter_data(submodule.q_proj, global_scale, "weight_global_scale")
-        update_parameter_data(submodule.k_proj, global_scale, "weight_global_scale")
-        update_parameter_data(submodule.v_proj, global_scale, "weight_global_scale")
-        del global_scale
+        update_parameter_data(submodule.k_proj, global_scale, "weight_global_scale")
+        update_parameter_data(submodule.q_proj, global_scale, "weight_global_scale")
+        update_parameter_data(submodule.v_proj, global_scale, "weight_global_scale")
+
+        del global_scale

     if _is_mlp_module(submodule):
         if not _valid_tensor_group_quant([submodule.gate_proj, submodule.up_proj]):
@@ -91,10 +92,7 @@ def _valid_tensor_group_quant(layer_list: List[Linear]):
             )
         ).reshape([1])

-        update_parameter_data(
-            submodule.gate_proj, global_scale, "weight_global_scale"
-        )
-        update_parameter_data(
-            submodule.up_proj, global_scale, "weight_global_scale"
-        )
-        del global_scale
+        update_parameter_data(submodule.gate_proj, global_scale, "weight_global_scale")
+        update_parameter_data(submodule.up_proj, global_scale, "weight_global_scale")
+
+        del global_scale
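
The pattern in both hunks is the same: compute one global scale for a fused projection group, write it onto every member with update_parameter_data, then drop the temporary. A minimal self-contained sketch of that pattern follows; the update_parameter_data below is a hypothetical stand-in rather than the llmcompressor helper, the Attention module is a toy, and the abs-max/min reduction is an assumption for illustration only.

import torch
from torch.nn import Linear, Module


def update_parameter_data(layer: Module, value: torch.Tensor, name: str) -> None:
    # Stand-in for the real helper: attach the tensor to the layer as a
    # non-trainable parameter under the given name.
    layer.register_parameter(
        name, torch.nn.Parameter(value.clone(), requires_grad=False)
    )


class Attention(Module):
    def __init__(self, hidden: int = 64):
        super().__init__()
        self.q_proj = Linear(hidden, hidden)
        self.k_proj = Linear(hidden, hidden)
        self.v_proj = Linear(hidden, hidden)


submodule = Attention()

# One shared scale for the fused q/k/v group, shaped [1] as in the diff.
global_scale = torch.min(
    torch.cat(
        [
            proj.weight.abs().max().reshape([1])
            for proj in (submodule.q_proj, submodule.k_proj, submodule.v_proj)
        ]
    )
).reshape([1])

for proj in (submodule.k_proj, submodule.q_proj, submodule.v_proj):
    update_parameter_data(proj, global_scale, "weight_global_scale")

del global_scale
print(submodule.q_proj.weight_global_scale)  # same [1]-shaped tensor on each proj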

tests/llmcompressor/transformers/tracing/test_models.py

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@


 @pytest.mark.skipif(
-    os.getenv("HF_TOKEN") is None,
+    (not os.getenv("HF_TOKEN")),
     reason="Skipping tracing tests requiring gated model access",
 )
 @pytest.mark.parametrize(
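
The behavioral difference here is small but real: os.getenv returns an empty string, not None, when the variable is exported but blank, so the old is-None check would still run the gated-model tests with an unusable token. A quick sketch:

import os

os.environ["HF_TOKEN"] = ""           # exported but empty
print(os.getenv("HF_TOKEN") is None)  # False -> old condition would not skip
print(not os.getenv("HF_TOKEN"))      # True  -> new condition skips as intended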
