Commit 5375f18

[Examples] [Bugfix] Perform sample generation before saving as compressed (#1530)
## Purpose ##
* Fix failing examples

## Changes ##
* Save the model after sample generation in all examples
* Previously, models were saved before generation, causing generation to fail because generating with compressed models is not yet fully supported

## Future ##
* Define a better API around compressing and decompressing models that does not require so many arguments
* Standardize on reloading (and redispatching) the model before generation, as suggested in #1263
* Remove the sample generation step entirely

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
1 parent f773617 commit 5375f18
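Taken together, the change applies the same reordering to every example: call oneshot without an output_dir, run sample generation on the still-in-memory (uncompressed) model, and only then write the checkpoint with save_compressed=True. Below is a minimal sketch of that ordering assembled from the diffs that follow; the imports, MODEL_ID, and recipe are illustrative assumptions rather than a copy of any single example.

# Minimal sketch of the new ordering (quantize -> generate -> save).
# MODEL_ID and the recipe are placeholders; each example defines its own.
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot  # import path assumed; some versions expose this under llmcompressor.transformers
from llmcompressor.modifiers.quantization import QuantizationModifier

MODEL_ID = "google/gemma-2-2b-it"  # placeholder model
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# 1) Apply quantization in place. Note: no output_dir, so nothing is written to disk yet.
recipe = QuantizationModifier(targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
oneshot(model=model, recipe=recipe)

# 2) Sample generation runs against the in-memory, not-yet-compressed model.
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(model.device)
print(tokenizer.decode(model.generate(input_ids, max_new_tokens=20)[0]))

# 3) Only now save to disk in compressed-tensors format.
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)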

10 files changed: 55 additions, 40 deletions


examples/quantization_w8a8_fp8/gemma2_example.py

Lines changed: 5 additions & 2 deletions
@@ -20,12 +20,10 @@
 )

 # 3) Apply quantization and save in compressed-tensors format.
-OUTPUT_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
 oneshot(
     model=model,
     recipe=recipe,
     tokenizer=tokenizer,
-    output_dir=OUTPUT_DIR,
 )

 # Confirm generations of the quantized model look sane.
@@ -37,3 +35,8 @@
 output = model.generate(input_ids, max_new_tokens=20)
 print(tokenizer.decode(output[0]))
 print("==========================================")
+
+# 4) Save to disk in compressed-tensors format.
+SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)

examples/quantization_w8a8_fp8/llama3.2_vision_example.py

Lines changed: 6 additions & 7 deletions
@@ -22,17 +22,16 @@
 )

 # Apply quantization and save to disk in compressed-tensors format.
-SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
-oneshot(
-    model=model,
-    recipe=recipe,
-    output_dir=SAVE_DIR,
-)
-processor.save_pretrained(SAVE_DIR)
+oneshot(model=model, recipe=recipe)

 # Confirm generations of the quantized model look sane.
 print("========== SAMPLE GENERATION ==============")
 input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda")
 output = model.generate(input_ids, max_new_tokens=20)
 print(processor.decode(output[0]))
 print("==========================================")
+
+# Save to disk in compressed-tensors format.
+SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)

examples/quantization_w8a8_fp8/llava1.5_example.py

Lines changed: 6 additions & 3 deletions
@@ -22,13 +22,16 @@
 )

 # Apply quantization and save to disk in compressed-tensors format.
-SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
-oneshot(model=model, recipe=recipe, output_dir=SAVE_DIR)
-processor.save_pretrained(SAVE_DIR)
+oneshot(model=model, recipe=recipe)

 # Confirm generations of the quantized model look sane.
 print("========== SAMPLE GENERATION ==============")
 input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda")
 output = model.generate(input_ids, max_new_tokens=20)
 print(processor.decode(output[0]))
 print("==========================================")
+
+# Save to disk in compressed-tensors format.
+SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)

examples/quantization_w8a8_fp8/qwen2vl_example.py

Lines changed: 6 additions & 3 deletions
@@ -22,13 +22,16 @@
 )

 # Apply quantization and save to disk in compressed-tensors format.
-SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
-oneshot(model=model, recipe=recipe, output_dir=SAVE_DIR)
-processor.save_pretrained(SAVE_DIR)
+oneshot(model=model, recipe=recipe)

 # Confirm generations of the quantized model look sane.
 print("========== SAMPLE GENERATION ==============")
 input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda")
 output = model.generate(input_ids, max_new_tokens=20)
 print(processor.decode(output[0]))
 print("==========================================")
+
+# Save to disk in compressed-tensors format.
+SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)

examples/quantization_w8a8_int8/gemma2_example.py

Lines changed: 6 additions & 2 deletions
@@ -57,14 +57,13 @@ def tokenize(sample):
 # * quantize the activations to int8 (dynamic per token)
 recipe = GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"])

-# 4) Apply quantization and save to disk compressed.
+# 4) Apply quantization
 oneshot(
     model=model,
     dataset=ds,
     recipe=recipe,
     max_seq_length=MAX_SEQUENCE_LENGTH,
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
-    output_dir=MODEL_ID.split("/")[1] + "-INT8",
 )

 # Confirm generations of the quantized model look sane.
@@ -76,3 +75,8 @@ def tokenize(sample):
 output = model.generate(input_ids, max_new_tokens=20)
 print(tokenizer.decode(output[0]))
 print("==========================================")
+
+# 5) Save to disk in compressed-tensors format.
+SAVE_DIR = MODEL_ID.split("/")[1] + "-INT8"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)

examples/quantizing_moe/deepseek_moe_w4a16.py

Lines changed: 5 additions & 4 deletions
@@ -70,9 +70,6 @@ def tokenize(sample):
 # list so they remain at full precision
 recipe = "deepseek_recipe_w4a16.yaml"

-SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16"
-
-
 oneshot(
     model=model,
     dataset=ds,
@@ -81,7 +78,6 @@ def tokenize(sample):
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
     save_compressed=True,
     trust_remote_code_model=True,
-    output_dir=SAVE_DIR,
 )

 # Confirm generations of the quantized model look sane.
@@ -98,6 +94,11 @@ def tokenize(sample):
         "deepseek models with transformers >= 4.48"
     )

+# Save to disk in compressed-tensors format.
+SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)
+

 # Run the model on vLLM
 try:
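This example then goes on to load the saved checkpoint in vLLM (the "# Run the model on vLLM" block visible in the trailing context above). A rough sketch of such a smoke test, assuming vLLM can load the compressed-tensors checkpoint; the prompt and sampling settings are illustrative only:

from vllm import LLM, SamplingParams

# SAVE_DIR as defined above; trust_remote_code is needed for the deepseek architecture.
llm = LLM(model=SAVE_DIR, trust_remote_code=True)
outputs = llm.generate(["Hello my name is"], SamplingParams(max_tokens=20))
print(outputs[0].outputs[0].text)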

examples/quantizing_moe/deepseek_moe_w8a8_fp8.py

Lines changed: 5 additions & 4 deletions
@@ -66,17 +66,13 @@ def tokenize(sample):
     ),
 ]

-SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8"
-
 oneshot(
     model=model,
     dataset=ds,
     recipe=recipe,
     max_seq_length=MAX_SEQUENCE_LENGTH,
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
     trust_remote_code_model=True,
-    save_compressed=True,
-    output_dir=SAVE_DIR,
 )

 # Confirm generations of the quantized model look sane.
@@ -94,3 +90,8 @@ def tokenize(sample):
         "WARNING: cannot perform sample generation of "
         "deepseek models with transformers >= 4.48"
     )
+
+# Save to disk in compressed-tensors format.
+SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)

examples/quantizing_moe/deepseek_moe_w8a8_int8.py

Lines changed: 5 additions & 4 deletions
@@ -78,17 +78,13 @@ def tokenize(sample):
     ),
 ]

-SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8"
-
 oneshot(
     model=model,
     dataset=ds,
     recipe=recipe,
     max_seq_length=MAX_SEQUENCE_LENGTH,
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
     trust_remote_code_model=True,
-    save_compressed=True,
-    output_dir=SAVE_DIR,
 )

 # Confirm generations of the quantized model look sane.
@@ -107,3 +103,8 @@ def tokenize(sample):
         "WARNING: cannot perform sample generation of "
         "deepseek models with transformers >= 4.48"
     )
+
+# Save to disk in compressed-tensors format.
+SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)

examples/quantizing_moe/mixtral_moe_w8a8_fp8.py

Lines changed: 6 additions & 7 deletions
@@ -27,15 +27,11 @@
 MAX_SEQ_LENGTH = 2048
 NUM_CALIBRATION_SAMPLES = 512

-# Save location of quantized model
-SAVE_DIR = f"{MODEL_ID.split('/')[-1]}-FP8"
-SAVE_COMPRESSED = True
-
+# Recipe
 layers_to_ignore: List[str] = [
     "lm_head",
     "re:.*block_sparse_moe.gate",  # does not quantize well
 ]
-
 recipe = QuantizationModifier(scheme="FP8", targets="Linear", ignore=layers_to_ignore)


@@ -47,8 +43,6 @@
     recipe=recipe,
     max_seq_length=MAX_SEQ_LENGTH,
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
-    save_compressed=SAVE_COMPRESSED,
-    output_dir=SAVE_DIR,
 )

 # Confirm generations of the quantized model look sane.
@@ -64,3 +58,8 @@
         "WARNING: cannot perform sample generation of "
         "deepseek models with transformers >= 4.48"
     )
+
+# Save to disk in compressed-tensors format.
+SAVE_DIR = f"{MODEL_ID.split('/')[-1]}-FP8"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)

examples/quantizing_moe/qwen_moe_w4a16.py

Lines changed: 5 additions & 4 deletions
@@ -70,9 +70,6 @@ def tokenize(sample):
     ignore=["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"],
 )

-SAVE_DIR = MODEL_ID.split("/")[1] + "-quantized.w4a16"
-
-
 oneshot(
     model=model,
     dataset=ds,
@@ -81,7 +78,6 @@ def tokenize(sample):
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
     save_compressed=True,
     trust_remote_code_model=True,
-    output_dir=SAVE_DIR,
 )

 # Confirm generations of the quantized model look sane.
@@ -90,3 +86,8 @@ def tokenize(sample):
 output = model.generate(input_ids, max_new_tokens=20)
 print(tokenizer.decode(output[0]))
 print("==========================================")
+
+# Save to disk in compressed-tensors format.
+SAVE_DIR = MODEL_ID.split("/")[1] + "-quantized.w4a16"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)
