
Commit c48823f

Merge branch 'kylesayrs/sequential-onloading' into kylesayrs/deactivate-sequential_targets
2 parents: ab9dee0 + 6edc523

File tree: 3 files changed (+12, -13 lines)


examples/quantization_w8a8_int8/gemma2_example.py

Lines changed: 2 additions & 1 deletion
@@ -67,10 +67,11 @@ def tokenize(sample):
 # NOTE: transformers 4.49.0 results in a generation error with gemma2.
 # Consider either downgrading your transformers version to a previous version
 # or use vLLM for sample generation.
+# Note: compile is disabled: https://github.com/huggingface/transformers/issues/38333
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
 input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
-output = model.generate(input_ids, max_new_tokens=20)
+output = model.generate(input_ids, max_new_tokens=20, disable_compile=True)
 print(tokenizer.decode(output[0]))
 print("==========================================")
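
For context, a minimal sketch of the workaround this hunk applies. It assumes a transformers release recent enough to accept the disable_compile generation flag (which the hunk itself uses) and a CUDA device; the checkpoint name and dtype below are illustrative, not taken from this commit.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "google/gemma-2-2b-it"  # illustrative Gemma-2 checkpoint
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="cuda"
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
# disable_compile=True sidesteps the torch.compile generation bug tracked in
# https://github.com/huggingface/transformers/issues/38333
output = model.generate(input_ids, max_new_tokens=20, disable_compile=True)
print(tokenizer.decode(output[0]))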

src/llmcompressor/modifiers/utils/helpers.py

Lines changed: 9 additions & 11 deletions
@@ -72,10 +72,11 @@ def _valid_tensor_group_quant(layer_list: List[Linear]):
             )
         ).reshape([1])

-        update_parameter_data(submodule.q_proj, global_scale, "weight_global_scale")
-        update_parameter_data(submodule.k_proj, global_scale, "weight_global_scale")
-        update_parameter_data(submodule.v_proj, global_scale, "weight_global_scale")
-        del global_scale
+        update_parameter_data(submodule.k_proj, global_scale, "weight_global_scale")
+        update_parameter_data(submodule.q_proj, global_scale, "weight_global_scale")
+        update_parameter_data(submodule.v_proj, global_scale, "weight_global_scale")
+
+        del global_scale

     if _is_mlp_module(submodule):
         if not _valid_tensor_group_quant([submodule.gate_proj, submodule.up_proj]):
@@ -91,10 +92,7 @@ def _valid_tensor_group_quant(layer_list: List[Linear]):
             )
         ).reshape([1])

-        update_parameter_data(
-            submodule.gate_proj, global_scale, "weight_global_scale"
-        )
-        update_parameter_data(
-            submodule.up_proj, global_scale, "weight_global_scale"
-        )
-        del global_scale
+        update_parameter_data(submodule.gate_proj, global_scale, "weight_global_scale")
+        update_parameter_data(submodule.up_proj, global_scale, "weight_global_scale")
+
+        del global_scale
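
The pattern in both hunks is the same: compute one global scale for a fused projection group, write it onto every member with update_parameter_data, then drop the temporary. A minimal self-contained sketch of that pattern follows; the update_parameter_data below is a hypothetical stand-in rather than the llmcompressor helper, the Attention module is a toy, and the abs-max/min reduction is an assumption for illustration only.

import torch
from torch.nn import Linear, Module


def update_parameter_data(layer: Module, value: torch.Tensor, name: str) -> None:
    # Stand-in for the real helper: attach the tensor to the layer as a
    # non-trainable parameter under the given name.
    layer.register_parameter(
        name, torch.nn.Parameter(value.clone(), requires_grad=False)
    )


class Attention(Module):
    def __init__(self, hidden: int = 64):
        super().__init__()
        self.q_proj = Linear(hidden, hidden)
        self.k_proj = Linear(hidden, hidden)
        self.v_proj = Linear(hidden, hidden)


submodule = Attention()

# One shared scale for the fused q/k/v group, shaped [1] as in the diff.
global_scale = torch.min(
    torch.cat(
        [
            proj.weight.abs().max().reshape([1])
            for proj in (submodule.q_proj, submodule.k_proj, submodule.v_proj)
        ]
    )
).reshape([1])

for proj in (submodule.k_proj, submodule.q_proj, submodule.v_proj):
    update_parameter_data(proj, global_scale, "weight_global_scale")

del global_scale
print(submodule.q_proj.weight_global_scale)  # same [1]-shaped tensor on each proj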

tests/llmcompressor/transformers/tracing/test_models.py

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@


 @pytest.mark.skipif(
-    os.getenv("HF_TOKEN") is None,
+    (not os.getenv("HF_TOKEN")),
     reason="Skipping tracing tests requiring gated model access",
 )
 @pytest.mark.parametrize(
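
The behavioral difference here is small but real: os.getenv returns an empty string, not None, when the variable is exported but blank, so the old is-None check would still run the gated-model tests with an unusable token. A quick sketch:

import os

os.environ["HF_TOKEN"] = ""           # exported but empty
print(os.getenv("HF_TOKEN") is None)  # False -> old condition would not skip
print(not os.getenv("HF_TOKEN"))      # True  -> new condition skips as intended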
