
Commit 5f9af3c

[Bugfix] [Examples] Fix remaining Gemma generations (#1604)

## Purpose ##

* Fix generation for Gemma models
* See: #1517

## Changes ##

* Disable compilation for remaining Gemma models

---------

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
1 parent 95e8bdc · commit 5f9af3c
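For context, here is a minimal standalone sketch of the workaround this commit applies: passing `disable_compile=True` to `generate()` so transformers skips `torch.compile` during decoding (see https://github.com/huggingface/transformers/issues/38333). This is not part of the commit itself; it assumes a transformers version recent enough to accept the flag, and the checkpoint name is illustrative.

```python
# Minimal sketch of the workaround (not part of this commit).
# Assumes a transformers version whose generate() accepts disable_compile;
# the model ID below is illustrative.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "google/gemma-2-2b-it"  # illustrative Gemma 2 checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cuda")

input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
# disable_compile=True skips the automatic torch.compile of the forward
# pass that triggers the Gemma 2 generation error in the linked issue.
output = model.generate(input_ids, max_new_tokens=20, disable_compile=True)
print(tokenizer.decode(output[0]))
```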

File tree: 2 files changed (+4 −2 lines changed)

* examples/quantization_kv_cache/gemma2_fp8_kv_example.py
* examples/quantization_w8a8_fp8/gemma2_example.py


examples/quantization_kv_cache/gemma2_fp8_kv_example.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -87,11 +87,12 @@ def process_and_tokenize(example):
 # NOTE: transformers 4.49.0 results in a generation error with gemma2.
 # Consider either downgrading your transformers version to a previous version
 # or use vLLM for sample generation.
+# Note: compile is disabled: https://github.com/huggingface/transformers/issues/38333
 print("\n\n")
 dispatch_for_generation(model)
 print("========== SAMPLE GENERATION ==============")
 input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
-output = model.generate(input_ids, max_new_tokens=100)
+output = model.generate(input_ids, max_new_tokens=100, disable_compile=True)
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")
```

examples/quantization_w8a8_fp8/gemma2_example.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -29,10 +29,11 @@
 # NOTE: transformers 4.49.0 results in a generation error with gemma2.
 # Consider either downgrading your transformers version to a previous version
 # or use vLLM for sample generation.
+# Note: compile is disabled: https://github.com/huggingface/transformers/issues/38333
 print("========== SAMPLE GENERATION ==============")
 dispatch_for_generation(model)
 input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
-output = model.generate(input_ids, max_new_tokens=20)
+output = model.generate(input_ids, max_new_tokens=20, disable_compile=True)
 print(tokenizer.decode(output[0]))
 print("==========================================")
```

0 commit comments
