[MoE] Cleanup MoE examples #1576

Open
wants to merge 24 commits into base: main
Changes from 10 commits
125 changes: 0 additions & 125 deletions examples/quantizing_moe/deepseek_moe_w4a16.py

This file was deleted.

8 changes: 0 additions & 8 deletions examples/quantizing_moe/deepseek_recipe_w4a16.yaml

This file was deleted.

@@ -12,18 +12,17 @@
# previous version or upgrading to a version where this bug is fixed

# select a Mixture of Experts model for quantization
MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
MODEL_ID = "deepseek-ai/DeepSeek-V2.5"

model = AutoModelForCausalLM.from_pretrained(
MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True
)
Comment on lines 17 to 19 (Member):

Don't we need device_map="auto" to make this huge model fit? I think it would be nice to still keep around a small MoE example.
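
A minimal sketch of the loading call the reviewer is suggesting (not part of this diff), assuming accelerate is installed so that device_map="auto" can shard the checkpoint across the available GPUs and CPU memory:

# Sketch only: device placement is delegated to Accelerate instead of
# loading the full model onto a single device.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",  # shard layers across available devices
    trust_remote_code=True,
)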

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Select calibration dataset.
# it's recommended to use more calibration samples for MoE models so each expert is hit
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"
NUM_CALIBRATION_SAMPLES = 2048
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048


@@ -57,16 +56,12 @@ def tokenize(sample):

ds = ds.map(tokenize, remove_columns=ds.column_names)

# define a llmcompressor recipe for INT8 W8A8 quantization
# Configure the quantization algorithm to run.
# since the MoE gate layers are sensitive to quantization, we add them to the ignore
# list so they remain at full precision
recipe = [
GPTQModifier(
targets="Linear",
scheme="W8A8",
ignore=["lm_head", "re:.*mlp.gate$"],
),
]
recipe = GPTQModifier(
targets="Linear", scheme="W4A16", ignore=["lm_head", "re:.*mlp.gate$"]
)

oneshot(
model=model,
@@ -82,12 +77,10 @@ def tokenize(sample):
if Version(__version__) < Version("4.48"):
print("========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)
SAMPLE_INPUT = ["I love quantization because"]
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device)
output = model.generate(**inputs, max_length=50)
text_output = tokenizer.batch_decode(output)
print(text_output)
sample = tokenizer("Hello my name is", return_tensors="pt")
sample = {key: value.to("cuda") for key, value in sample.items()}
output = model.generate(**sample, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================")
else:
print(
@@ -96,6 +89,6 @@ def tokenize(sample):
)

# Save to disk in compressed-tensors format.
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W8A8"
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
@@ -1,28 +1,23 @@
import torch
from datasets import load_dataset
from packaging.version import Version
from transformers import AutoModelForCausalLM, AutoTokenizer, __version__
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.utils import dispatch_for_generation

# NOTE: transformers 4.49.0 has an attribute error with DeepSeek.
# Please consider either downgrading your transformers version to a
# previous version or upgrading to a version where this bug is fixed

# select a Mixture of Experts model for quantization
MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1"

model = AutoModelForCausalLM.from_pretrained(
MODEL_ID, torch_dtype="auto", trust_remote_code=True
MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Select calibration dataset.
# it's recommended to use more calibration samples for MoE models so each expert is hit
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"
NUM_CALIBRATION_SAMPLES = 2048
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048


@@ -56,16 +51,17 @@ def tokenize(sample):

ds = ds.map(tokenize, remove_columns=ds.column_names)

# define a llmcompressor recipe for FP8 W8A8 quantization
# Configure the quantization algorithm to run.
# since the MoE gate layers are sensitive to quantization, we add them to the ignore
# list so they remain at full precision
recipe = [
QuantizationModifier(
targets="Linear",
scheme="FP8",
ignore=["lm_head", "re:.*mlp.gate$"],
),
]
recipe = QuantizationModifier(
scheme="W4A16",
targets="Linear",
ignore=[
"lm_head",
"re:.*block_sparse_moe.gate", # does not quantize well
],
)

oneshot(
model=model,
@@ -76,24 +72,15 @@ def tokenize(sample):
trust_remote_code_model=True,
)

# Confirm generations of the quantized model look sane.
# Generation is broken for deepseek models when using the latest transformers package
if Version(__version__) < Version("4.48"):
print("========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)
SAMPLE_INPUT = ["I love quantization because"]
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device)
output = model.generate(**inputs, max_length=50)
text_output = tokenizer.batch_decode(output)
print(text_output)
else:
print(
"WARNING: cannot perform sample generation of "
"deepseek models with transformers >= 4.48"
)
print("========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)
sample = tokenizer("Hello my name is", return_tensors="pt")
sample = {key: value.to("cuda") for key, value in sample.items()}
output = model.generate(**sample, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================")

# Save to disk in compressed-tensors format.
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8"
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16-G128"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
58 changes: 0 additions & 58 deletions examples/quantizing_moe/mixtral_moe_w8a8_fp8.py

This file was deleted.

@@ -73,12 +73,13 @@ def tokenize(sample):
# Confirm generations of the quantized model look sane.
print("========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=20)
sample = tokenizer("Hello my name is", return_tensors="pt")
sample = {key: value.to("cuda") for key, value in sample.items()}
output = model.generate(**sample, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================")

# Save to disk in compressed-tensors format.
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-quantized.w4a16"
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16-G128"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
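
The directory saved above is a compressed-tensors checkpoint; as a hedged sketch (not part of this PR, and assuming a vLLM build with compressed-tensors support), it can be reloaded for inference roughly like this:

# Sketch only: reload the compressed checkpoint with vLLM for a quick generation check.
from vllm import LLM, SamplingParams

llm = LLM(model=SAVE_DIR)  # SAVE_DIR produced by the example above
outputs = llm.generate(["Hello my name is"], SamplingParams(max_tokens=100))
print(outputs[0].outputs[0].text)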
8 changes: 8 additions & 0 deletions src/llmcompressor/args/dataset_arguments.py
@@ -171,6 +171,7 @@ class DatasetArguments(CustomDatasetArguments):
"will execute code present on the Hub on your local machine."
},
)
# --- pipeline arguments --- #
pipeline: Optional[str] = field(
default="independent",
metadata={
@@ -196,3 +197,10 @@
"definition"
},
)
model_input_device: Optional[str] = field(
default=None,
metadata={
"help": "Device to put model inputs on for calibration. "
"If none is specified, the model input device is inferred from the model"
},
)
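
As a hedged usage sketch (not in this diff), the new field would presumably be passed through oneshot's keyword arguments alongside the other dataset arguments, for example to pin calibration inputs to a specific device:

# Sketch only: assumes DatasetArguments fields are forwarded via oneshot kwargs,
# as the existing dataset arguments are in the examples above.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    model_input_device="cuda:0",  # override the inferred model input device
)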