Commit 5375f18

[Examples] [Bugfix] Perform sample generation before saving as compressed (#1530)
## Purpose ##
* Fix failing examples

## Changes ##
* Save the model after sample generation in all examples
* Previously, models were saved before generation, causing generation to fail because generating with compressed models is not yet fully supported

## Future ##
* Define a better API around compressing and decompressing models that does not require so many arguments
* Standardize on reloading (and redispatching) the model before generation, as suggested in #1263
* Remove the sample generation step entirely

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
1 parent f773617 commit 5375f18
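Taken together, the change applies the same reordering to every example: call oneshot without an output_dir, run sample generation on the still-in-memory (uncompressed) model, and only then write the checkpoint with save_compressed=True. Below is a minimal sketch of that ordering assembled from the diffs that follow; the imports, MODEL_ID, and recipe are illustrative assumptions rather than a copy of any single example.

# Minimal sketch of the new ordering (quantize -> generate -> save).
# MODEL_ID and the recipe are placeholders; each example defines its own.
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot  # import path assumed; some versions expose this under llmcompressor.transformers
from llmcompressor.modifiers.quantization import QuantizationModifier

MODEL_ID = "google/gemma-2-2b-it"  # placeholder model
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# 1) Apply quantization in place. Note: no output_dir, so nothing is written to disk yet.
recipe = QuantizationModifier(targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
oneshot(model=model, recipe=recipe)

# 2) Sample generation runs against the in-memory, not-yet-compressed model.
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(model.device)
print(tokenizer.decode(model.generate(input_ids, max_new_tokens=20)[0]))

# 3) Only now save to disk in compressed-tensors format.
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)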

10 files changed: 55 additions, 40 deletions


examples/quantization_w8a8_fp8/gemma2_example.py

Lines changed: 5 additions & 2 deletions
@@ -20,12 +20,10 @@
 )

 # 3) Apply quantization and save in compressed-tensors format.
-OUTPUT_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
 oneshot(
     model=model,
     recipe=recipe,
     tokenizer=tokenizer,
-    output_dir=OUTPUT_DIR,
 )

 # Confirm generations of the quantized model look sane.
@@ -37,3 +35,8 @@
 output = model.generate(input_ids, max_new_tokens=20)
 print(tokenizer.decode(output[0]))
 print("==========================================")
+
+# 4) Save to disk in compressed-tensors format.
+SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)

examples/quantization_w8a8_fp8/llama3.2_vision_example.py

Lines changed: 6 additions & 7 deletions
@@ -22,17 +22,16 @@
 )

 # Apply quantization and save to disk in compressed-tensors format.
-SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
-oneshot(
-    model=model,
-    recipe=recipe,
-    output_dir=SAVE_DIR,
-)
-processor.save_pretrained(SAVE_DIR)
+oneshot(model=model, recipe=recipe)

 # Confirm generations of the quantized model look sane.
 print("========== SAMPLE GENERATION ==============")
 input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda")
 output = model.generate(input_ids, max_new_tokens=20)
 print(processor.decode(output[0]))
 print("==========================================")
+
+# Save to disk in compressed-tensors format.
+SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)

examples/quantization_w8a8_fp8/llava1.5_example.py

Lines changed: 6 additions & 3 deletions
@@ -22,13 +22,16 @@
 )

 # Apply quantization and save to disk in compressed-tensors format.
-SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
-oneshot(model=model, recipe=recipe, output_dir=SAVE_DIR)
-processor.save_pretrained(SAVE_DIR)
+oneshot(model=model, recipe=recipe)

 # Confirm generations of the quantized model look sane.
 print("========== SAMPLE GENERATION ==============")
 input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda")
 output = model.generate(input_ids, max_new_tokens=20)
 print(processor.decode(output[0]))
 print("==========================================")
+
+# Save to disk in compressed-tensors format.
+SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)

examples/quantization_w8a8_fp8/qwen2vl_example.py

Lines changed: 6 additions & 3 deletions
@@ -22,13 +22,16 @@
 )

 # Apply quantization and save to disk in compressed-tensors format.
-SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
-oneshot(model=model, recipe=recipe, output_dir=SAVE_DIR)
-processor.save_pretrained(SAVE_DIR)
+oneshot(model=model, recipe=recipe)

 # Confirm generations of the quantized model look sane.
 print("========== SAMPLE GENERATION ==============")
 input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda")
 output = model.generate(input_ids, max_new_tokens=20)
 print(processor.decode(output[0]))
 print("==========================================")
+
+# Save to disk in compressed-tensors format.
+SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)

examples/quantization_w8a8_int8/gemma2_example.py

Lines changed: 6 additions & 2 deletions
@@ -57,14 +57,13 @@ def tokenize(sample):
 # * quantize the activations to int8 (dynamic per token)
 recipe = GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"])

-# 4) Apply quantization and save to disk compressed.
+# 4) Apply quantization
 oneshot(
     model=model,
     dataset=ds,
     recipe=recipe,
     max_seq_length=MAX_SEQUENCE_LENGTH,
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
-    output_dir=MODEL_ID.split("/")[1] + "-INT8",
 )

 # Confirm generations of the quantized model look sane.
@@ -76,3 +75,8 @@ def tokenize(sample):
 output = model.generate(input_ids, max_new_tokens=20)
 print(tokenizer.decode(output[0]))
 print("==========================================")
+
+# 5) Save to disk in compressed-tensors format.
+SAVE_DIR = MODEL_ID.split("/")[1] + "-INT8"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)

examples/quantizing_moe/deepseek_moe_w4a16.py

Lines changed: 5 additions & 4 deletions
@@ -70,9 +70,6 @@ def tokenize(sample):
 # list so they remain at full precision
 recipe = "deepseek_recipe_w4a16.yaml"

-SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16"
-
-
 oneshot(
     model=model,
     dataset=ds,
@@ -81,7 +78,6 @@ def tokenize(sample):
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
     save_compressed=True,
     trust_remote_code_model=True,
-    output_dir=SAVE_DIR,
 )

 # Confirm generations of the quantized model look sane.
@@ -98,6 +94,11 @@ def tokenize(sample):
         "deepseek models with transformers >= 4.48"
     )

+# Save to disk in compressed-tensors format.
+SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)
+

 # Run the model on vLLM
 try:
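This example then goes on to load the saved checkpoint in vLLM (the "# Run the model on vLLM" block visible in the trailing context above). A rough sketch of such a smoke test, assuming vLLM can load the compressed-tensors checkpoint; the prompt and sampling settings are illustrative only:

from vllm import LLM, SamplingParams

# SAVE_DIR as defined above; trust_remote_code is needed for the deepseek architecture.
llm = LLM(model=SAVE_DIR, trust_remote_code=True)
outputs = llm.generate(["Hello my name is"], SamplingParams(max_tokens=20))
print(outputs[0].outputs[0].text)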

examples/quantizing_moe/deepseek_moe_w8a8_fp8.py

Lines changed: 5 additions & 4 deletions
@@ -66,17 +66,13 @@ def tokenize(sample):
     ),
 ]

-SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8"
-
 oneshot(
     model=model,
     dataset=ds,
     recipe=recipe,
     max_seq_length=MAX_SEQUENCE_LENGTH,
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
     trust_remote_code_model=True,
-    save_compressed=True,
-    output_dir=SAVE_DIR,
 )

 # Confirm generations of the quantized model look sane.
@@ -94,3 +90,8 @@ def tokenize(sample):
         "WARNING: cannot perform sample generation of "
         "deepseek models with transformers >= 4.48"
     )
+
+# Save to disk in compressed-tensors format.
+SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)

examples/quantizing_moe/deepseek_moe_w8a8_int8.py

Lines changed: 5 additions & 4 deletions
@@ -78,17 +78,13 @@ def tokenize(sample):
     ),
 ]

-SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8"
-
 oneshot(
     model=model,
     dataset=ds,
     recipe=recipe,
     max_seq_length=MAX_SEQUENCE_LENGTH,
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
     trust_remote_code_model=True,
-    save_compressed=True,
-    output_dir=SAVE_DIR,
 )

 # Confirm generations of the quantized model look sane.
@@ -107,3 +103,8 @@ def tokenize(sample):
         "WARNING: cannot perform sample generation of "
         "deepseek models with transformers >= 4.48"
     )
+
+# Save to disk in compressed-tensors format.
+SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)

examples/quantizing_moe/mixtral_moe_w8a8_fp8.py

Lines changed: 6 additions & 7 deletions
@@ -27,15 +27,11 @@
 MAX_SEQ_LENGTH = 2048
 NUM_CALIBRATION_SAMPLES = 512

-# Save location of quantized model
-SAVE_DIR = f"{MODEL_ID.split('/')[-1]}-FP8"
-SAVE_COMPRESSED = True
-
+# Recipe
 layers_to_ignore: List[str] = [
     "lm_head",
     "re:.*block_sparse_moe.gate",  # does not quantize well
 ]
-
 recipe = QuantizationModifier(scheme="FP8", targets="Linear", ignore=layers_to_ignore)


@@ -47,8 +43,6 @@
     recipe=recipe,
     max_seq_length=MAX_SEQ_LENGTH,
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
-    save_compressed=SAVE_COMPRESSED,
-    output_dir=SAVE_DIR,
 )

 # Confirm generations of the quantized model look sane.
@@ -64,3 +58,8 @@
         "WARNING: cannot perform sample generation of "
         "deepseek models with transformers >= 4.48"
     )
+
+# Save to disk in compressed-tensors format.
+SAVE_DIR = f"{MODEL_ID.split('/')[-1]}-FP8"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)

examples/quantizing_moe/qwen_moe_w4a16.py

Lines changed: 5 additions & 4 deletions
@@ -70,9 +70,6 @@ def tokenize(sample):
     ignore=["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"],
 )

-SAVE_DIR = MODEL_ID.split("/")[1] + "-quantized.w4a16"
-
-
 oneshot(
     model=model,
     dataset=ds,
@@ -81,7 +78,6 @@ def tokenize(sample):
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
     save_compressed=True,
     trust_remote_code_model=True,
-    output_dir=SAVE_DIR,
 )

 # Confirm generations of the quantized model look sane.
@@ -90,3 +86,8 @@ def tokenize(sample):
 output = model.generate(input_ids, max_new_tokens=20)
 print(tokenizer.decode(output[0]))
 print("==========================================")
+
+# Save to disk in compressed-tensors format.
+SAVE_DIR = MODEL_ID.split("/")[1] + "-quantized.w4a16"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)
