examples/quantization_w8a8_int8 (1 file changed: +2, -1)

@@ -70,9 +70,10 @@ def tokenize(sample):
 # NOTE: transformers 4.49.0 results in a generation error with gemma2.
 # Consider either downgrading your transformers version to a previous version
 # or use vLLM for sample generation.
+# Note: compile is disabled: https://github.com/huggingface/transformers/issues/38333
 print("========== SAMPLE GENERATION ==============")
 input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
-output = model.generate(input_ids, max_new_tokens=20)
+output = model.generate(input_ids, max_new_tokens=20, disable_compile=True)
 print(tokenizer.decode(output[0]))
 print("==========================================")
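For reference, a minimal standalone sketch of the patched sample-generation step. This is an illustration, not the full example script; the model ID below is an assumption, and the example may load a different gemma2 checkpoint.

# Minimal sketch of the patched sample generation (assumed model ID, for illustration only).
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "google/gemma-2-2b-it"  # assumption: any gemma2 checkpoint hitting the issue

model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="cuda", torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

print("========== SAMPLE GENERATION ==============")
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
# disable_compile=True skips torch.compile during generation, working around
# https://github.com/huggingface/transformers/issues/38333
output = model.generate(input_ids, max_new_tokens=20, disable_compile=True)
print(tokenizer.decode(output[0]))
print("==========================================")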