1 file changed: +0 −11 lines
@@ -4,7 +4,6 @@
 from llmcompressor.modeling import prepare_for_quantization
 from llmcompressor.modifiers.quantization import GPTQModifier
 from llmcompressor.transformers import oneshot
-from llmcompressor.utils import dispatch_for_generation
 
 # Select model and load it.
 # For DeepSeekv3, we require a full precision model in order to properly calibrate
@@ -72,16 +71,6 @@ def tokenize(sample):
     sequential_targets=["DeepseekV3Attention", "DeepseekV3MLP"],
 )
 
-# Confirm generations of the quantized model look sane.
-print("\n\n")
-print("========== SAMPLE GENERATION ==============")
-dispatch_for_generation(model)
-sample = tokenizer("Hello my name is", return_tensors="pt")
-sample = {key: value.to("cuda") for key, value in sample.items()}
-output = model.generate(**sample, max_new_tokens=100)
-print(tokenizer.decode(output[0]))
-print("==========================================\n\n")
-
 # Save to disk compressed.
 SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
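
The deleted block ran an in-script sanity generation before saving. If you still want that check, a minimal sketch is to reload the saved checkpoint and generate from it instead. Assumptions, not part of this diff: transformers with compressed-tensors support is installed, a GPU is available, and SAVE_DIR is the directory written by the script above.

# Minimal sketch, not part of this diff: reload the compressed checkpoint
# and confirm generations look sane. Assumes transformers + compressed-tensors
# are installed and SAVE_DIR is the directory produced by save_pretrained above.
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto", torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(SAVE_DIR)

sample = tokenizer("Hello my name is", return_tensors="pt").to(model.device)
output = model.generate(**sample, max_new_tokens=100)
print(tokenizer.decode(output[0]))

Unlike the removed in-script block, this checks the model after it has round-tripped through the compressed on-disk format, so it also exercises the serialization path.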