From b30eade3ed2b505eb59950609c9cbc6e728addc0 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 19 Jun 2025 10:55:47 -0400 Subject: [PATCH 01/22] deepseekv3 Signed-off-by: Kyle Sayers --- examples/quantizing_moe/deepseekv3_example.py | 85 +++++++++++++++++++ src/llmcompressor/entrypoints/oneshot.py | 10 +++ src/llmcompressor/modeling/__init__.py | 3 + src/llmcompressor/modeling/deepseek_v3.py | 48 +++++++++++ src/llmcompressor/modeling/prepare.py | 22 +++++ src/llmcompressor/utils/module.py | 27 ++++++ 6 files changed, 195 insertions(+) create mode 100644 examples/quantizing_moe/deepseekv3_example.py create mode 100644 src/llmcompressor/modeling/__init__.py create mode 100644 src/llmcompressor/modeling/deepseek_v3.py create mode 100644 src/llmcompressor/modeling/prepare.py create mode 100644 src/llmcompressor/utils/module.py diff --git a/examples/quantizing_moe/deepseekv3_example.py b/examples/quantizing_moe/deepseekv3_example.py new file mode 100644 index 000000000..ecec45a19 --- /dev/null +++ b/examples/quantizing_moe/deepseekv3_example.py @@ -0,0 +1,85 @@ +from datasets import load_dataset +from transformers import AutoModelForCausalLM, AutoTokenizer + +from llmcompressor.modeling import prepare_for_quantization +from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.transformers import oneshot + +# Select model and load it. +model_id = "RedHatAI/DeepSeek-V3-BF16" +model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") +tokenizer = AutoTokenizer.from_pretrained(model_id) +model = prepare_for_quantization(model) + +# Select calibration dataset. +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" + +# Select number of samples. 512 samples is a good place to start. +# Increasing the number of samples can improve accuracy. +NUM_CALIBRATION_SAMPLES = 512 +MAX_SEQUENCE_LENGTH = 2048 + +# Load dataset and preprocess. +ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") +ds = ds.shuffle(seed=42) + + +def preprocess(example): + return { + "text": tokenizer.apply_chat_template( + example["messages"], + tokenize=False, + ) + } + + +ds = ds.map(preprocess) + + +# Tokenize inputs. +def tokenize(sample): + return tokenizer( + sample["text"], + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) + + +ds = ds.map(tokenize, remove_columns=ds.column_names) + +# Configure the quantization algorithm to run. +# * quantize the weights to 4 bit with GPTQ with a group size 128 +recipe = GPTQModifier( + targets="Linear", + scheme="W4A16", + ignore=["lm_head"], + sequential_targets=["DeepseekV3Attention", "DeepseekV3MLP"], +) + +# Apply algorithms. +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, +) + +# Save to disk compressed. +SAVE_DIR = model_id.split("/")[-1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) + +# Load model after saving +model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") + +# Confirm generations of the quantized model look sane. 
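+# The model was reloaded above from the compressed checkpoint with
+# device_map="auto", so the inputs below are moved to CUDA before generating.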
+print("\n\n") +print("========== SAMPLE GENERATION ==============") +input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") +output = model.generate(input_ids, max_new_tokens=100) +print(tokenizer.decode(output[0])) +print("==========================================\n\n") diff --git a/src/llmcompressor/entrypoints/oneshot.py b/src/llmcompressor/entrypoints/oneshot.py index 945c71943..fe5624cd8 100644 --- a/src/llmcompressor/entrypoints/oneshot.py +++ b/src/llmcompressor/entrypoints/oneshot.py @@ -2,6 +2,8 @@ from datetime import datetime from typing import TYPE_CHECKING, List, Optional, Union +import torch +from compressed_tensors.utils import offloaded_dispatch from loguru import logger from torch.utils.data import DataLoader from transformers import PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin @@ -127,6 +129,14 @@ def __init__( # initialize the model and processor pre_process(model_args) + # offload to cpu if possible + if "cuda" in str(model_args.oneshot_device) and torch.cuda.is_available(): + offloaded_dispatch( + model_args.model, execution_device=model_args.oneshot_device + ) + else: + logger.warning("CUDA is not available! Compressing model on CPU instead") + # Set instance attributes self.model = self.model_args.model self.processor = self.model_args.processor diff --git a/src/llmcompressor/modeling/__init__.py b/src/llmcompressor/modeling/__init__.py new file mode 100644 index 000000000..e2c22ed1f --- /dev/null +++ b/src/llmcompressor/modeling/__init__.py @@ -0,0 +1,3 @@ +# flake8: noqa + +from .prepare import * diff --git a/src/llmcompressor/modeling/deepseek_v3.py b/src/llmcompressor/modeling/deepseek_v3.py new file mode 100644 index 000000000..4b885ff64 --- /dev/null +++ b/src/llmcompressor/modeling/deepseek_v3.py @@ -0,0 +1,48 @@ +import torch +from transformers.models.deepseek_v3.modeling_deepseek_v3 import DeepseekV3MoE + + +class DeepseekV3MoECalibrate(torch.nn.Module): + def __init__(self, config, experts, gate, shared_experts): + super().__init__() + self.config = config + self.experts = experts + self.gate = gate + self.shared_experts = shared_experts + + def forward(self, hidden_states): + residuals = hidden_states + orig_shape = hidden_states.shape + topk_indices, topk_weights = self.gate(hidden_states) + hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) + + # Begin MoE + final_hidden_states = torch.zeros_like(hidden_states, dtype=topk_weights.dtype) + expert_mask = torch.nn.functional.one_hot( + topk_indices, num_classes=len(self.experts) + ) + expert_mask = expert_mask.permute(2, 0, 1) + + for expert_idx in range(len(self.experts)): + expert = self.experts[expert_idx] + mask = expert_mask[expert_idx] + token_indices, weight_indices = torch.where(mask) + + expert_weights = topk_weights[token_indices, weight_indices] + expert_input = hidden_states[token_indices] + expert_output = expert(expert_input) + weighted_output = expert_output * expert_weights.unsqueeze(-1) + + if token_indices.numel() > 0: + final_hidden_states.index_add_(0, token_indices, weighted_output) + # End MoE + + hidden_states = final_hidden_states.type(hidden_states.dtype).view(*orig_shape) + hidden_states = hidden_states + self.shared_experts(residuals) + return hidden_states + + +def replace(module: DeepseekV3MoE) -> DeepseekV3MoECalibrate: + return DeepseekV3MoECalibrate( + module.config, module.experts, module.gate, module.shared_experts + ) diff --git a/src/llmcompressor/modeling/prepare.py b/src/llmcompressor/modeling/prepare.py new file 
mode 100644 index 000000000..a8dedf8ee --- /dev/null +++ b/src/llmcompressor/modeling/prepare.py @@ -0,0 +1,22 @@ +import torch +from transformers import PreTrainedModel +from transformers.models.deepseek_v3.modeling_deepseek_v3 import DeepseekV3MoE + +from llmcompressor.modeling.deepseek_v3 import replace as replace_DeepseekV3MoE +from llmcompressor.utils.module import module_bfs + +__all__ = ["prepare_for_quantization"] + +replacements = { + DeepseekV3MoE: replace_DeepseekV3MoE, +} + + +def prepare_for_quantization(model: PreTrainedModel) -> PreTrainedModel: + def replace(module: torch.nn.Module) -> torch.nn.Module: + if module.__class__ in replacements: + return replacements[module.__class__](module) + else: + return module + + return module_bfs(model, replace, progress=True) diff --git a/src/llmcompressor/utils/module.py b/src/llmcompressor/utils/module.py new file mode 100644 index 000000000..a02aa8b4a --- /dev/null +++ b/src/llmcompressor/utils/module.py @@ -0,0 +1,27 @@ +from typing import Callable, Union + +import torch +import tqdm + +__all__ = ["module_bfs"] + + +def module_bfs( + module: torch.nn.Module, + func: Callable[[torch.nn.Module], torch.nn.Module], + pre: bool = True, + progress: Union[bool, tqdm.tqdm] = False, +) -> torch.nn.Module: + if progress is True: + total = len(list(module.modules())) + progress = tqdm.tqdm(total=total) + if pre: + module = func(module) + for name, child in list(module.named_children()): + module.add_module(name, module_bfs(child, func, pre, progress)) + if not pre: + module = func(module) + if isinstance(progress, tqdm.tqdm): + progress.update(1) + + return module From a957f2f2c98b3b5e3efa8fea5339dd1502682fe3 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 19 Jun 2025 10:56:53 -0400 Subject: [PATCH 02/22] remove dreg Signed-off-by: Kyle Sayers --- src/llmcompressor/entrypoints/oneshot.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/llmcompressor/entrypoints/oneshot.py b/src/llmcompressor/entrypoints/oneshot.py index fe5624cd8..945c71943 100644 --- a/src/llmcompressor/entrypoints/oneshot.py +++ b/src/llmcompressor/entrypoints/oneshot.py @@ -2,8 +2,6 @@ from datetime import datetime from typing import TYPE_CHECKING, List, Optional, Union -import torch -from compressed_tensors.utils import offloaded_dispatch from loguru import logger from torch.utils.data import DataLoader from transformers import PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin @@ -129,14 +127,6 @@ def __init__( # initialize the model and processor pre_process(model_args) - # offload to cpu if possible - if "cuda" in str(model_args.oneshot_device) and torch.cuda.is_available(): - offloaded_dispatch( - model_args.model, execution_device=model_args.oneshot_device - ) - else: - logger.warning("CUDA is not available! 
Compressing model on CPU instead") - # Set instance attributes self.model = self.model_args.model self.processor = self.model_args.processor From 2fd2a25569114ce8059bccfff2dc077790b38d0b Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 19 Jun 2025 11:03:33 -0400 Subject: [PATCH 03/22] reformat example Signed-off-by: Kyle Sayers --- examples/quantizing_moe/deepseekv3_example.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/examples/quantizing_moe/deepseekv3_example.py b/examples/quantizing_moe/deepseekv3_example.py index ecec45a19..b34a9faa7 100644 --- a/examples/quantizing_moe/deepseekv3_example.py +++ b/examples/quantizing_moe/deepseekv3_example.py @@ -4,6 +4,7 @@ from llmcompressor.modeling import prepare_for_quantization from llmcompressor.modifiers.quantization import GPTQModifier from llmcompressor.transformers import oneshot +from llmcompressor.utils import dispatch_for_generation # Select model and load it. model_id = "RedHatAI/DeepSeek-V3-BF16" @@ -68,18 +69,17 @@ def tokenize(sample): num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) -# Save to disk compressed. -SAVE_DIR = model_id.split("/")[-1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) - -# Load model after saving -model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto") - # Confirm generations of the quantized model look sane. print("\n\n") print("========== SAMPLE GENERATION ==============") -input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") -output = model.generate(input_ids, max_new_tokens=100) +dispatch_for_generation(model) +sample = tokenizer("Hello my name is", return_tensors="pt") +sample = {key: value.to("cuda") for key, value in sample.items()} +output = model.generate(**sample, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================\n\n") + +# Save to disk compressed. 
+SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) From b8b217c7bfeff3992ac167db5fbdcaf1dc208dee Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 19 Jun 2025 11:24:23 -0400 Subject: [PATCH 04/22] wip: clean up moe examples Signed-off-by: Kyle Sayers --- examples/quantizing_moe/deepseek_moe_w4a16.py | 125 ------------------ .../quantizing_moe/deepseek_moe_w8a8_fp8.py | 99 -------------- .../quantizing_moe/deepseek_recipe_w4a16.yaml | 8 -- ...e_w8a8_int8.py => deepseekv2_5_example.py} | 29 ++-- examples/quantizing_moe/deepseekv3_example.py | 13 +- .../quantizing_moe/mixtral_moe_w8a8_fp8.py | 96 +++++++++----- examples/quantizing_moe/qwen_moe_w4a16.py | 7 +- 7 files changed, 85 insertions(+), 292 deletions(-) delete mode 100644 examples/quantizing_moe/deepseek_moe_w4a16.py delete mode 100644 examples/quantizing_moe/deepseek_moe_w8a8_fp8.py delete mode 100644 examples/quantizing_moe/deepseek_recipe_w4a16.yaml rename examples/quantizing_moe/{deepseek_moe_w8a8_int8.py => deepseekv2_5_example.py} (76%) diff --git a/examples/quantizing_moe/deepseek_moe_w4a16.py b/examples/quantizing_moe/deepseek_moe_w4a16.py deleted file mode 100644 index 9880e9248..000000000 --- a/examples/quantizing_moe/deepseek_moe_w4a16.py +++ /dev/null @@ -1,125 +0,0 @@ -import torch -from datasets import load_dataset -from packaging.version import Version -from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ - -from llmcompressor import oneshot -from llmcompressor.utils import dispatch_for_generation - -# NOTE: transformers 4.49.0 has an attribute error with DeepSeek. -# Please consider either downgrading your transformers version to a -# previous version or upgrading to a version where this bug is fixed - -# select a Mixture of Experts model for quantization -MODEL_ID = "deepseek-ai/DeepSeek-V2.5" - -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -# Select calibration dataset. -DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_SPLIT = "train_sft" -NUM_CALIBRATION_SAMPLES = 512 -MAX_SEQUENCE_LENGTH = 2048 - - -# Load dataset and preprocess. -ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") -ds = ds.shuffle(seed=42) - - -def preprocess(example): - return { - "text": tokenizer.apply_chat_template( - example["messages"], - tokenize=False, - ) - } - - -ds = ds.map(preprocess) - - -# Tokenize inputs. -def tokenize(sample): - return tokenizer( - sample["text"], - padding=False, - max_length=MAX_SEQUENCE_LENGTH, - truncation=True, - add_special_tokens=False, - ) - - -ds = ds.map(tokenize, remove_columns=ds.column_names) - -# define a llmcompressor recipe for W416 quantization -# since the MoE gate layers are sensitive to quantization, we add them to the ignore -# list so they remain at full precision -recipe = "deepseek_recipe_w4a16.yaml" - -oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - save_compressed=True, - trust_remote_code_model=True, -) - -# Confirm generations of the quantized model look sane. 
-# Generation is broken for deepseek models when using the latest transformers package -if Version(__version__) < Version("4.48"): - print("========== SAMPLE GENERATION ==============") - dispatch_for_generation(model) - input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") - output = model.generate(input_ids, max_new_tokens=20) - print(tokenizer.decode(output[0])) - print("==========================================") -else: - print( - "WARNING: cannot perform sample generation of " - "deepseek models with transformers >= 4.48" - ) - -# Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) - - -# Run the model on vLLM -try: - from vllm import LLM, SamplingParams - - vllm_installed = True -except ImportError: - vllm_installed = False - -if vllm_installed: - print("vLLM installed, running using vLLM") - sampling_params = SamplingParams(temperature=0.80, top_p=0.95) - llm = LLM( - model=SAVE_DIR, - tensor_parallel_size=2, - trust_remote_code=True, - max_model_len=1042, - dtype=torch.half, - ) - prompts = [ - "The capital of France is", - "The president of the US is", - "My name is", - ] - - outputs = llm.generate(prompts, sampling_params) - print("================= vLLM GENERATION ======================") - for output in outputs: - assert output - prompt = output.prompt - generated_text = output.outputs[0].text - print("PROMPT", prompt) - print("GENERATED TEXT", generated_text) diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py b/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py deleted file mode 100644 index 0bc9c24df..000000000 --- a/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py +++ /dev/null @@ -1,99 +0,0 @@ -from datasets import load_dataset -from packaging.version import Version -from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ - -from llmcompressor import oneshot -from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.utils import dispatch_for_generation - -# NOTE: transformers 4.49.0 has an attribute error with DeepSeek. -# Please consider either downgrading your transformers version to a -# previous version or upgrading to a version where this bug is fixed - -# select a Mixture of Experts model for quantization -MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" - -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, torch_dtype="auto", trust_remote_code=True -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -# Select calibration dataset. -# its recommended to use more calibration samples for MoE models so each expert is hit -DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_SPLIT = "train_sft" -NUM_CALIBRATION_SAMPLES = 2048 -MAX_SEQUENCE_LENGTH = 2048 - - -# Load dataset and preprocess. -ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") -ds = ds.shuffle(seed=42) - - -def preprocess(example): - return { - "text": tokenizer.apply_chat_template( - example["messages"], - tokenize=False, - ) - } - - -ds = ds.map(preprocess) - - -# Tokenize inputs. 
-def tokenize(sample): - return tokenizer( - sample["text"], - padding=False, - max_length=MAX_SEQUENCE_LENGTH, - truncation=True, - add_special_tokens=False, - ) - - -ds = ds.map(tokenize, remove_columns=ds.column_names) - -# define a llmcompressor recipe for FP8 W8A8 quantization -# since the MoE gate layers are sensitive to quantization, we add them to the ignore -# list so they remain at full precision -recipe = [ - QuantizationModifier( - targets="Linear", - scheme="FP8", - ignore=["lm_head", "re:.*mlp.gate$"], - ), -] - -oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - trust_remote_code_model=True, -) - -# Confirm generations of the quantized model look sane. -# Generation is broken for deepseek models when using the latest transformers package -if Version(__version__) < Version("4.48"): - print("========== SAMPLE GENERATION ==============") - dispatch_for_generation(model) - SAMPLE_INPUT = ["I love quantization because"] - tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device) - output = model.generate(**inputs, max_length=50) - text_output = tokenizer.batch_decode(output) - print(text_output) -else: - print( - "WARNING: cannot perform sample generation of " - "deepseek models with transformers >= 4.48" - ) - -# Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/deepseek_recipe_w4a16.yaml b/examples/quantizing_moe/deepseek_recipe_w4a16.yaml deleted file mode 100644 index 23f276e2f..000000000 --- a/examples/quantizing_moe/deepseek_recipe_w4a16.yaml +++ /dev/null @@ -1,8 +0,0 @@ -quant_stage: - quant_modifiers: - GPTQModifier: - ignore: [lm_head, "re:.*mlp.gate$"] - config_groups: - group_0: - weights: {num_bits: 4, type: int, symmetric: true, strategy: channel, dynamic: false} - targets: [Linear] diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_int8.py b/examples/quantizing_moe/deepseekv2_5_example.py similarity index 76% rename from examples/quantizing_moe/deepseek_moe_w8a8_int8.py rename to examples/quantizing_moe/deepseekv2_5_example.py index 3ec506c34..c2b3b0305 100644 --- a/examples/quantizing_moe/deepseek_moe_w8a8_int8.py +++ b/examples/quantizing_moe/deepseekv2_5_example.py @@ -12,7 +12,7 @@ # previous version or upgrading to a version where this bug is fixed # select a Mixture of Experts model for quantization -MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" +MODEL_ID = "deepseek-ai/DeepSeek-V2.5" model = AutoModelForCausalLM.from_pretrained( MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True @@ -20,10 +20,9 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) # Select calibration dataset. -# its recommended to use more calibration samples for MoE models so each expert is hit DATASET_ID = "HuggingFaceH4/ultrachat_200k" DATASET_SPLIT = "train_sft" -NUM_CALIBRATION_SAMPLES = 2048 +NUM_CALIBRATION_SAMPLES = 512 MAX_SEQUENCE_LENGTH = 2048 @@ -57,16 +56,12 @@ def tokenize(sample): ds = ds.map(tokenize, remove_columns=ds.column_names) -# define a llmcompressor recipe for INT8 W8A8 quantization +# Configure the quantization algorithm to run. 
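+# The W4A16 preset quantizes weights to 4 bits (group size 128 by default in
+# compressed-tensors) while activations stay in 16-bit precision.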
# since the MoE gate layers are sensitive to quantization, we add them to the ignore # list so they remain at full precision -recipe = [ - GPTQModifier( - targets="Linear", - scheme="W8A8", - ignore=["lm_head", "re:.*mlp.gate$"], - ), -] +recipe = GPTQModifier( + targets="Linear", scheme="W4A16", ignore=["lm_head", "re:.*mlp.gate$"] +) oneshot( model=model, @@ -82,12 +77,10 @@ def tokenize(sample): if Version(__version__) < Version("4.48"): print("========== SAMPLE GENERATION ==============") dispatch_for_generation(model) - SAMPLE_INPUT = ["I love quantization because"] - tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device) - output = model.generate(**inputs, max_length=50) - text_output = tokenizer.batch_decode(output) - print(text_output) + sample = tokenizer("Hello my name is", return_tensors="pt") + sample = {key: value.to("cuda") for key, value in sample.items()} + output = model.generate(**sample, max_new_tokens=100) + print(tokenizer.decode(output[0])) print("==========================================") else: print( @@ -96,6 +89,6 @@ def tokenize(sample): ) # Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W8A8" +SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/deepseekv3_example.py b/examples/quantizing_moe/deepseekv3_example.py index b34a9faa7..1b4c334ff 100644 --- a/examples/quantizing_moe/deepseekv3_example.py +++ b/examples/quantizing_moe/deepseekv3_example.py @@ -7,6 +7,8 @@ from llmcompressor.utils import dispatch_for_generation # Select model and load it. +# For DeepSeekv3, we require a full precision model in order to properly calibrate +# `DeepSeek-V3-BF16` is a DeepSeek-V3 FP8 model which has been converted to BF16 model_id = "RedHatAI/DeepSeek-V3-BF16" model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(model_id) @@ -52,21 +54,22 @@ def tokenize(sample): ds = ds.map(tokenize, remove_columns=ds.column_names) # Configure the quantization algorithm to run. -# * quantize the weights to 4 bit with GPTQ with a group size 128 +# since the MoE gate layers are sensitive to quantization, we add them to the ignore +# list so they remain at full precision recipe = GPTQModifier( - targets="Linear", - scheme="W4A16", - ignore=["lm_head"], - sequential_targets=["DeepseekV3Attention", "DeepseekV3MLP"], + targets="Linear", scheme="W4A16", ignore=["lm_head", "re:.*mlp.gate$"] ) # Apply algorithms. +# due to the large size of DeepSeekV3, we specify sequential targets such that +# only one MLP is loaded into GPU memory at a time oneshot( model=model, dataset=ds, recipe=recipe, max_seq_length=MAX_SEQUENCE_LENGTH, num_calibration_samples=NUM_CALIBRATION_SAMPLES, + sequential_targets=["DeepseekV3Attention", "DeepseekV3MLP"], ) # Confirm generations of the quantized model look sane. 
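# A rough sketch (adapted from the removed deepseek_moe_w4a16.py example) of how
# the compressed checkpoint written by this example could be served with vLLM.
# The parallelism and context-length settings below are illustrative assumptions,
# not part of this patch, and should be adjusted to the available hardware.
from vllm import LLM, SamplingParams

SAVE_DIR = "DeepSeek-V3-BF16-W4A16-G128"  # path written by deepseekv3_example.py

sampling_params = SamplingParams(temperature=0.80, top_p=0.95)
llm = LLM(
    model=SAVE_DIR,
    tensor_parallel_size=8,  # assumption: set to the number of available GPUs
    trust_remote_code=True,
    max_model_len=2048,  # assumption: kept small to limit KV-cache memory
)

outputs = llm.generate(["The capital of France is"], sampling_params)
for output in outputs:
    print("PROMPT", output.prompt)
    print("GENERATED TEXT", output.outputs[0].text)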
diff --git a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py b/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py index a17bf873d..5021c7947 100644 --- a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py +++ b/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py @@ -1,56 +1,84 @@ -from typing import List - -from packaging.version import Version -from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ +import torch +from datasets import load_dataset +from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier from llmcompressor.utils import dispatch_for_generation +# select a Mixture of Experts model for quantization MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1" -model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True +) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -# Dataset config parameters -DATASET_ID = "open_platypus" -DATASET_SPLIT = "train" -MAX_SEQ_LENGTH = 2048 +# Select calibration dataset. +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" NUM_CALIBRATION_SAMPLES = 512 +MAX_SEQUENCE_LENGTH = 2048 + + +# Load dataset and preprocess. +ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") +ds = ds.shuffle(seed=42) + + +def preprocess(example): + return { + "text": tokenizer.apply_chat_template( + example["messages"], + tokenize=False, + ) + } + -# Recipe -layers_to_ignore: List[str] = [ - "lm_head", - "re:.*block_sparse_moe.gate", # does not quantize well -] -recipe = QuantizationModifier(scheme="FP8", targets="Linear", ignore=layers_to_ignore) +ds = ds.map(preprocess) +# Tokenize inputs. +def tokenize(sample): + return tokenizer( + sample["text"], + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) + + +ds = ds.map(tokenize, remove_columns=ds.column_names) + +# Configure the quantization algorithm to run. +# since the MoE gate layers are sensitive to quantization, we add them to the ignore +# list so they remain at full precision +recipe = QuantizationModifier( + scheme="FP8", + targets="Linear", + ignore=[ + "lm_head", + "re:.*block_sparse_moe.gate", # does not quantize well + ], +) + oneshot( model=model, - tokenizer=tokenizer, - dataset=DATASET_ID, - splits={"calibration": f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]"}, + dataset=ds, recipe=recipe, - max_seq_length=MAX_SEQ_LENGTH, + max_seq_length=MAX_SEQUENCE_LENGTH, num_calibration_samples=NUM_CALIBRATION_SAMPLES, + trust_remote_code_model=True, ) -# Confirm generations of the quantized model look sane. 
-# Generation is broken for deepseek models when using the latest transformers package -if Version(__version__) < Version("4.48"): - print("========== SAMPLE GENERATION ==============") - dispatch_for_generation(model) - input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") - output = model.generate(input_ids, max_new_tokens=20) - print(tokenizer.decode(output[0])) - print("==========================================") -else: - print( - "WARNING: cannot perform sample generation of " - "deepseek models with transformers >= 4.48" - ) +print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) +sample = tokenizer("Hello my name is", return_tensors="pt") +sample = {key: value.to("cuda") for key, value in sample.items()} +output = model.generate(**sample, max_new_tokens=100) +print(tokenizer.decode(output[0])) +print("==========================================") # Save to disk in compressed-tensors format. SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8" diff --git a/examples/quantizing_moe/qwen_moe_w4a16.py b/examples/quantizing_moe/qwen_moe_w4a16.py index 40a78a9b7..2531e6528 100644 --- a/examples/quantizing_moe/qwen_moe_w4a16.py +++ b/examples/quantizing_moe/qwen_moe_w4a16.py @@ -73,12 +73,13 @@ def tokenize(sample): # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") dispatch_for_generation(model) -input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") -output = model.generate(input_ids, max_new_tokens=20) +sample = tokenizer("Hello my name is", return_tensors="pt") +sample = {key: value.to("cuda") for key, value in sample.items()} +output = model.generate(**sample, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================") # Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-quantized.w4a16" +SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16-G128" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) From 43bc91df08aa5c14e9cd7653fd3a65d52fe50c52 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 19 Jun 2025 17:21:19 -0400 Subject: [PATCH 05/22] remove deepseek2.5 for now Signed-off-by: Kyle Sayers --- .../quantizing_moe/deepseekv2_5_example.py | 94 ------------------- 1 file changed, 94 deletions(-) delete mode 100644 examples/quantizing_moe/deepseekv2_5_example.py diff --git a/examples/quantizing_moe/deepseekv2_5_example.py b/examples/quantizing_moe/deepseekv2_5_example.py deleted file mode 100644 index c2b3b0305..000000000 --- a/examples/quantizing_moe/deepseekv2_5_example.py +++ /dev/null @@ -1,94 +0,0 @@ -import torch -from datasets import load_dataset -from packaging.version import Version -from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ - -from llmcompressor import oneshot -from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.utils import dispatch_for_generation - -# NOTE: transformers 4.49.0 has an attribute error with DeepSeek. 
-# Please consider either downgrading your transformers version to a -# previous version or upgrading to a version where this bug is fixed - -# select a Mixture of Experts model for quantization -MODEL_ID = "deepseek-ai/DeepSeek-V2.5" - -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -# Select calibration dataset. -DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_SPLIT = "train_sft" -NUM_CALIBRATION_SAMPLES = 512 -MAX_SEQUENCE_LENGTH = 2048 - - -# Load dataset and preprocess. -ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") -ds = ds.shuffle(seed=42) - - -def preprocess(example): - return { - "text": tokenizer.apply_chat_template( - example["messages"], - tokenize=False, - ) - } - - -ds = ds.map(preprocess) - - -# Tokenize inputs. -def tokenize(sample): - return tokenizer( - sample["text"], - padding=False, - max_length=MAX_SEQUENCE_LENGTH, - truncation=True, - add_special_tokens=False, - ) - - -ds = ds.map(tokenize, remove_columns=ds.column_names) - -# Configure the quantization algorithm to run. -# since the MoE gate layers are sensitive to quantization, we add them to the ignore -# list so they remain at full precision -recipe = GPTQModifier( - targets="Linear", scheme="W4A16", ignore=["lm_head", "re:.*mlp.gate$"] -) - -oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - trust_remote_code_model=True, -) - -# Confirm generations of the quantized model look sane. -# Generation is broken for deepseek models when using the latest transformers package -if Version(__version__) < Version("4.48"): - print("========== SAMPLE GENERATION ==============") - dispatch_for_generation(model) - sample = tokenizer("Hello my name is", return_tensors="pt") - sample = {key: value.to("cuda") for key, value in sample.items()} - output = model.generate(**sample, max_new_tokens=100) - print(tokenizer.decode(output[0])) - print("==========================================") -else: - print( - "WARNING: cannot perform sample generation of " - "deepseek models with transformers >= 4.48" - ) - -# Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) From 7d8ed369ae6abcdf7e1b8604c61a9d770f9b560f Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 19 Jun 2025 17:38:29 -0400 Subject: [PATCH 06/22] update readme Signed-off-by: Kyle Sayers --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 8f27ff9c6..66bb0a117 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,7 @@ Big updates have landed in LLM Compressor! Check out these exciting new features: +* **DeepSeekV3 and Sequential Onloading Support** As of llm-compressor>=0.6.0, you can now quantize DeepSeekV3 and other large models on a single GPU. Models are broken into disjoint layers which are then onloaded to the GPU one layer at a time. For more information on sequential onloading, see [Big Modeling with Sequential Onloading](examples/big_models_with_sequential_onloading/README.md) as well as the [DeepSeekV3 Example](examples/quantizing_moe/deepseekv3_example.py). * **Preliminary FP4 Quantization Support:** Quantize weights and activations to FP4 and seamlessly run the compressed model in vLLM. 
Model weights and activations are quantized following the NVFP4 [configuration](https://github.com/neuralmagic/compressed-tensors/blob/f5dbfc336b9c9c361b9fe7ae085d5cb0673e56eb/src/compressed_tensors/quantization/quant_scheme.py#L104). See examples of [weight-only quantization](examples/quantization_w4a16_fp4/llama3_example.py) and [fp4 activation support](examples/quantization_w4a4_fp4/llama3_example.py). Support is currently preliminary and additional support will be added for MoEs. * **Axolotl Sparse Finetuning Integration:** Seamlessly finetune sparse LLMs with our Axolotl integration. Learn how to create [fast sparse open-source models with Axolotl and LLM Compressor](https://developers.redhat.com/articles/2025/06/17/axolotl-meets-llm-compressor-fast-sparse-open). See also the [Axolotl integration docs](https://docs.axolotl.ai/docs/custom_integrations.html#llmcompressor). * **AutoAWQ Integration:** Perform low-bit weight-only quantization efficiently using AutoAWQ, now part of LLM Compressor. *Note: This integration should be considered experimental for now. Enhanced support, including for MoE models and improved handling of larger models via layer sequential pipelining, is planned for upcoming releases.* [See the details](https://github.com/vllm-project/llm-compressor/pull/1177). From b7273a90f859f84103dc7bde04ac9a8c4f2611cd Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 19 Jun 2025 18:23:25 -0400 Subject: [PATCH 07/22] infer model device with optional override Signed-off-by: Kyle Sayers --- .../quantizing_moe/deepseekv2_5_example.py | 94 +++++++++++++++++++ src/llmcompressor/args/dataset_arguments.py | 8 ++ src/llmcompressor/pipelines/basic/pipeline.py | 6 +- .../pipelines/layer_sequential/helpers.py | 3 +- .../pipelines/layer_sequential/pipeline.py | 6 +- .../pipelines/sequential/pipeline.py | 6 +- src/llmcompressor/utils/dev.py | 28 +++++- 7 files changed, 145 insertions(+), 6 deletions(-) create mode 100644 examples/quantizing_moe/deepseekv2_5_example.py diff --git a/examples/quantizing_moe/deepseekv2_5_example.py b/examples/quantizing_moe/deepseekv2_5_example.py new file mode 100644 index 000000000..c2b3b0305 --- /dev/null +++ b/examples/quantizing_moe/deepseekv2_5_example.py @@ -0,0 +1,94 @@ +import torch +from datasets import load_dataset +from packaging.version import Version +from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ + +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.utils import dispatch_for_generation + +# NOTE: transformers 4.49.0 has an attribute error with DeepSeek. +# Please consider either downgrading your transformers version to a +# previous version or upgrading to a version where this bug is fixed + +# select a Mixture of Experts model for quantization +MODEL_ID = "deepseek-ai/DeepSeek-V2.5" + +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +# Select calibration dataset. +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" +NUM_CALIBRATION_SAMPLES = 512 +MAX_SEQUENCE_LENGTH = 2048 + + +# Load dataset and preprocess. +ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") +ds = ds.shuffle(seed=42) + + +def preprocess(example): + return { + "text": tokenizer.apply_chat_template( + example["messages"], + tokenize=False, + ) + } + + +ds = ds.map(preprocess) + + +# Tokenize inputs. 
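+# The chat-template text produced above is tokenized without padding and truncated
+# to MAX_SEQUENCE_LENGTH; add_special_tokens=False is used since the chat template
+# already adds the special tokens.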
+def tokenize(sample): + return tokenizer( + sample["text"], + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) + + +ds = ds.map(tokenize, remove_columns=ds.column_names) + +# Configure the quantization algorithm to run. +# since the MoE gate layers are sensitive to quantization, we add them to the ignore +# list so they remain at full precision +recipe = GPTQModifier( + targets="Linear", scheme="W4A16", ignore=["lm_head", "re:.*mlp.gate$"] +) + +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + trust_remote_code_model=True, +) + +# Confirm generations of the quantized model look sane. +# Generation is broken for deepseek models when using the latest transformers package +if Version(__version__) < Version("4.48"): + print("========== SAMPLE GENERATION ==============") + dispatch_for_generation(model) + sample = tokenizer("Hello my name is", return_tensors="pt") + sample = {key: value.to("cuda") for key, value in sample.items()} + output = model.generate(**sample, max_new_tokens=100) + print(tokenizer.decode(output[0])) + print("==========================================") +else: + print( + "WARNING: cannot perform sample generation of " + "deepseek models with transformers >= 4.48" + ) + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/src/llmcompressor/args/dataset_arguments.py b/src/llmcompressor/args/dataset_arguments.py index 949933f97..f5e107b10 100644 --- a/src/llmcompressor/args/dataset_arguments.py +++ b/src/llmcompressor/args/dataset_arguments.py @@ -171,6 +171,7 @@ class DatasetArguments(CustomDatasetArguments): "will execute code present on the Hub on your local machine." }, ) + # --- pipeline arguments --- # pipeline: Optional[str] = field( default="independent", metadata={ @@ -196,3 +197,10 @@ class DatasetArguments(CustomDatasetArguments): "definition" }, ) + model_input_device: Optional[str] = field( + default=None, + metadata={ + "help": "Device to put model inputs on for calibration." 
+ "If none is specified, the model input device is inferred from the model" + }, + ) diff --git a/src/llmcompressor/pipelines/basic/pipeline.py b/src/llmcompressor/pipelines/basic/pipeline.py index 605358ae9..ef192755e 100644 --- a/src/llmcompressor/pipelines/basic/pipeline.py +++ b/src/llmcompressor/pipelines/basic/pipeline.py @@ -2,7 +2,6 @@ import torch import tqdm -from compressed_tensors.utils import get_execution_device from torch.utils.data.dataloader import DataLoader from llmcompressor.core import LifecycleCallbacks @@ -10,6 +9,7 @@ from llmcompressor.pipelines.registry import CalibrationPipeline from llmcompressor.pytorch.utils.helpers import tensors_to_device from llmcompressor.utils import calibration_forward_context, dispatch_for_generation +from llmcompressor.utils.dev import infer_model_device if TYPE_CHECKING: from llmcompressor.args.dataset_arguments import DatasetArguments @@ -38,7 +38,9 @@ def __call__( :param dataset_args: dataset arguments relevant to pipelines """ dispatch_for_generation(model) # basic dispatch is identical to generation - model_device = get_execution_device(model) + model_device = dataset_args.model_input_device + if model_device is None: + model_device = infer_model_device(model) LifecycleCallbacks.calibration_epoch_start() diff --git a/src/llmcompressor/pipelines/layer_sequential/helpers.py b/src/llmcompressor/pipelines/layer_sequential/helpers.py index f9d828d14..c8b9fcbd3 100644 --- a/src/llmcompressor/pipelines/layer_sequential/helpers.py +++ b/src/llmcompressor/pipelines/layer_sequential/helpers.py @@ -44,6 +44,7 @@ def capture_first_layer_intermediates( model: Module, first_layer: Module, dataloader: DataLoader, + model_device: torch.device = torch.device("cpu"), mask_padding: bool = True, ) -> IntermediatesCache: """ @@ -68,7 +69,7 @@ def capture_first_layer_intermediates( desc = "Preparing intermediates cache" for batch_index, batch in enumerate(tqdm.tqdm(dataloader, desc=desc)): batch = apply_pad_mask_to_batch(batch) if mask_padding else batch - batch = tensors_to_device(batch, torch.device("cpu")) + batch = tensors_to_device(batch, model_device) try: model(**batch) diff --git a/src/llmcompressor/pipelines/layer_sequential/pipeline.py b/src/llmcompressor/pipelines/layer_sequential/pipeline.py index d8ad73a10..95e4a2fea 100644 --- a/src/llmcompressor/pipelines/layer_sequential/pipeline.py +++ b/src/llmcompressor/pipelines/layer_sequential/pipeline.py @@ -19,6 +19,7 @@ dispatch_for_sequential, get_sequential_targets, ) +from llmcompressor.utils.dev import infer_model_device from llmcompressor.utils.helpers import DisableQuantization, calibration_forward_context if TYPE_CHECKING: @@ -60,6 +61,9 @@ def __call__( # prepare model for sequential onloading dispatch_for_sequential(model) + model_device = dataset_args.model_input_device + if model_device is None: + model_device = infer_model_device(model) # find layers modifiers = session.get_modifiers() @@ -71,7 +75,7 @@ def __call__( with calibration_forward_context(model), DisableQuantization(model): # prepare intermediates cache intermediates: IntermediatesCache = capture_first_layer_intermediates( - model, layers[0], dataloader + model, layers[0], dataloader, model_device ) num_layers = len(layers) diff --git a/src/llmcompressor/pipelines/sequential/pipeline.py b/src/llmcompressor/pipelines/sequential/pipeline.py index 8cefeb0cf..d8ae6661a 100644 --- a/src/llmcompressor/pipelines/sequential/pipeline.py +++ b/src/llmcompressor/pipelines/sequential/pipeline.py @@ -14,6 +14,7 @@ 
get_sequential_targets, trace_subgraphs, ) +from llmcompressor.utils.dev import infer_model_device from llmcompressor.utils.helpers import DisableQuantization, calibration_forward_context if TYPE_CHECKING: @@ -54,6 +55,9 @@ def __call__( # prepare model for sequential onloading dispatch_for_sequential(model) + model_device = dataset_args.model_input_device + if model_device is None: + model_device = infer_model_device(model) # prepare to trace subgraphs modifiers = session.get_modifiers() @@ -69,7 +73,7 @@ def __call__( with calibration_forward_context(model), DisableQuantization(model): # prepare intermediates cache - activations = IntermediatesCache.from_dataloader(dataloader) + activations = IntermediatesCache.from_dataloader(dataloader, model_device) for subgraph_index, subgraph in enumerate(subgraphs): # prepare tqdm description texts diff --git a/src/llmcompressor/utils/dev.py b/src/llmcompressor/utils/dev.py index f0feb6c04..b44e76739 100644 --- a/src/llmcompressor/utils/dev.py +++ b/src/llmcompressor/utils/dev.py @@ -7,7 +7,7 @@ import torch from accelerate import dispatch_model, infer_auto_device_map from accelerate.utils import get_balanced_memory -from compressed_tensors.utils import remove_dispatch +from compressed_tensors.utils import has_offloaded_params, remove_dispatch from huggingface_hub import snapshot_download from safetensors.torch import save_file from transformers import AutoModelForCausalLM, PreTrainedModel @@ -20,6 +20,7 @@ "skip_weights_download", "patch_transformers_logger_level", "dispatch_for_generation", + "infer_model_device", ] @@ -140,3 +141,28 @@ def dispatch_for_generation(model: PreTrainedModel) -> PreTrainedModel: ) return dispatch_model(model, device_map=device_map) + + +def infer_model_device(model: PreTrainedModel) -> torch.device: + """ + Gets the model's execution device (the device that model inputs should be on) + using non-guaranteed but reasonable assumptions about module and parameter order. 
+ + If a model is offloaded, assume that modules execute in the same order + that they are returned by torch.nn.Module.modules() + + If a model is not offloaded, assume that parameters are used in the same order + that they are used + + :param model: model whose execution device is being inferred + :return: device which model inputs should be put on + """ + for module in model.modules(): + if has_offloaded_params(module): + return module._hf_hook.execution_device + + first_param = next(module.parameters(), None) + if first_param is None: + return torch.device("cpu") + + return first_param.device From afebe2e953364232ab745cd0fc95ee0ca7d9b999 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Fri, 20 Jun 2025 10:43:23 -0400 Subject: [PATCH 08/22] handle nullable dataset_args Signed-off-by: Kyle Sayers --- src/llmcompressor/pipelines/basic/pipeline.py | 2 +- src/llmcompressor/pipelines/layer_sequential/pipeline.py | 2 +- src/llmcompressor/pipelines/sequential/pipeline.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llmcompressor/pipelines/basic/pipeline.py b/src/llmcompressor/pipelines/basic/pipeline.py index ef192755e..87f463bfe 100644 --- a/src/llmcompressor/pipelines/basic/pipeline.py +++ b/src/llmcompressor/pipelines/basic/pipeline.py @@ -38,7 +38,7 @@ def __call__( :param dataset_args: dataset arguments relevant to pipelines """ dispatch_for_generation(model) # basic dispatch is identical to generation - model_device = dataset_args.model_input_device + model_device = getattr(dataset_args, "model_input_device") if model_device is None: model_device = infer_model_device(model) diff --git a/src/llmcompressor/pipelines/layer_sequential/pipeline.py b/src/llmcompressor/pipelines/layer_sequential/pipeline.py index 95e4a2fea..089f7dc8c 100644 --- a/src/llmcompressor/pipelines/layer_sequential/pipeline.py +++ b/src/llmcompressor/pipelines/layer_sequential/pipeline.py @@ -61,7 +61,7 @@ def __call__( # prepare model for sequential onloading dispatch_for_sequential(model) - model_device = dataset_args.model_input_device + model_device = getattr(dataset_args, "model_input_device") if model_device is None: model_device = infer_model_device(model) diff --git a/src/llmcompressor/pipelines/sequential/pipeline.py b/src/llmcompressor/pipelines/sequential/pipeline.py index d8ae6661a..c9ce9cd30 100644 --- a/src/llmcompressor/pipelines/sequential/pipeline.py +++ b/src/llmcompressor/pipelines/sequential/pipeline.py @@ -55,7 +55,7 @@ def __call__( # prepare model for sequential onloading dispatch_for_sequential(model) - model_device = dataset_args.model_input_device + model_device = getattr(dataset_args, "model_input_device") if model_device is None: model_device = infer_model_device(model) From ab3aa3e85ebcb8d4a5a7e8e2bd645c93616baf03 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Fri, 20 Jun 2025 10:46:54 -0400 Subject: [PATCH 09/22] update docstrings, comments Signed-off-by: Kyle Sayers --- src/llmcompressor/args/dataset_arguments.py | 2 +- src/llmcompressor/utils/dev.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/llmcompressor/args/dataset_arguments.py b/src/llmcompressor/args/dataset_arguments.py index f5e107b10..677d09daa 100644 --- a/src/llmcompressor/args/dataset_arguments.py +++ b/src/llmcompressor/args/dataset_arguments.py @@ -200,7 +200,7 @@ class DatasetArguments(CustomDatasetArguments): model_input_device: Optional[str] = field( default=None, metadata={ - "help": "Device to put model inputs on for calibration." 
+ "help": "Device to put model inputs on for calibration. " "If none is specified, the model input device is inferred from the model" }, ) diff --git a/src/llmcompressor/utils/dev.py b/src/llmcompressor/utils/dev.py index b44e76739..558ef816e 100644 --- a/src/llmcompressor/utils/dev.py +++ b/src/llmcompressor/utils/dev.py @@ -149,10 +149,10 @@ def infer_model_device(model: PreTrainedModel) -> torch.device: using non-guaranteed but reasonable assumptions about module and parameter order. If a model is offloaded, assume that modules execute in the same order - that they are returned by torch.nn.Module.modules() + that they are returned by `model.modules()` If a model is not offloaded, assume that parameters are used in the same order - that they are used + that they are returned by `model.parameters()` :param model: model whose execution device is being inferred :return: device which model inputs should be put on @@ -161,7 +161,7 @@ def infer_model_device(model: PreTrainedModel) -> torch.device: if has_offloaded_params(module): return module._hf_hook.execution_device - first_param = next(module.parameters(), None) + first_param = next(model.parameters(), None) if first_param is None: return torch.device("cpu") From e9e30c3e6580ba171ba40158ab57746fe601ae90 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Fri, 20 Jun 2025 13:37:55 -0400 Subject: [PATCH 10/22] rename files, update examples tests Signed-off-by: Kyle Sayers --- .../{mixtral_moe_w8a8_fp8.py => mixtral_example.py} | 4 ++-- .../{qwen_moe_w4a16.py => qwen_example.py} | 0 tests/examples/test_quantizing_moe.py | 11 ++++------- 3 files changed, 6 insertions(+), 9 deletions(-) rename examples/quantizing_moe/{mixtral_moe_w8a8_fp8.py => mixtral_example.py} (96%) rename examples/quantizing_moe/{qwen_moe_w4a16.py => qwen_example.py} (100%) diff --git a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py b/examples/quantizing_moe/mixtral_example.py similarity index 96% rename from examples/quantizing_moe/mixtral_moe_w8a8_fp8.py rename to examples/quantizing_moe/mixtral_example.py index 5021c7947..49b08c722 100644 --- a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py +++ b/examples/quantizing_moe/mixtral_example.py @@ -55,7 +55,7 @@ def tokenize(sample): # since the MoE gate layers are sensitive to quantization, we add them to the ignore # list so they remain at full precision recipe = QuantizationModifier( - scheme="FP8", + scheme="W4A16", targets="Linear", ignore=[ "lm_head", @@ -81,6 +81,6 @@ def tokenize(sample): print("==========================================") # Save to disk in compressed-tensors format. 
-SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8" +SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16-G128" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/qwen_moe_w4a16.py b/examples/quantizing_moe/qwen_example.py similarity index 100% rename from examples/quantizing_moe/qwen_moe_w4a16.py rename to examples/quantizing_moe/qwen_example.py diff --git a/tests/examples/test_quantizing_moe.py b/tests/examples/test_quantizing_moe.py index 49686d25c..50e86c2c8 100644 --- a/tests/examples/test_quantizing_moe.py +++ b/tests/examples/test_quantizing_moe.py @@ -44,14 +44,11 @@ def test_doc_example_command(self, example_dir: str, tmp_path: Path): "script_filename", [ pytest.param( - "deepseek_moe_w4a16.py", - marks=[ - pytest.mark.multi_gpu, - pytest.mark.skip(reason="exceptionally long run time"), - ], + "deepseekv3_example.py", + marks=pytest.mark.skip(reason="exceptionally long run time"), ), - pytest.param("deepseek_moe_w8a8_fp8.py"), - pytest.param("deepseek_moe_w8a8_int8.py", marks=pytest.mark.multi_gpu), + pytest.param("mixtral_example.py"), + pytest.param("qwen_example.py"), ], ) def test_deepseek_example_script( From 6bf5acbc9c06b7e440012f7b0b09576ffbe3a05f Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Fri, 20 Jun 2025 13:42:12 -0400 Subject: [PATCH 11/22] rebase on main Signed-off-by: Kyle Sayers --- examples/quantizing_moe/deepseek_moe_w4a16.py | 125 ++++++++++++++++++ .../quantizing_moe/deepseek_moe_w8a8_fp8.py | 99 ++++++++++++++ .../quantizing_moe/deepseek_moe_w8a8_int8.py | 101 ++++++++++++++ .../quantizing_moe/deepseek_recipe_w4a16.yaml | 8 ++ .../quantizing_moe/mixtral_moe_w8a8_fp8.py | 96 +++++--------- examples/quantizing_moe/qwen_moe_w4a16.py | 7 +- src/llmcompressor/modeling/__init__.py | 3 - src/llmcompressor/modeling/deepseek_v3.py | 48 ------- src/llmcompressor/modeling/prepare.py | 22 --- src/llmcompressor/utils/module.py | 27 ---- 10 files changed, 370 insertions(+), 166 deletions(-) create mode 100644 examples/quantizing_moe/deepseek_moe_w4a16.py create mode 100644 examples/quantizing_moe/deepseek_moe_w8a8_fp8.py create mode 100644 examples/quantizing_moe/deepseek_moe_w8a8_int8.py create mode 100644 examples/quantizing_moe/deepseek_recipe_w4a16.yaml delete mode 100644 src/llmcompressor/modeling/__init__.py delete mode 100644 src/llmcompressor/modeling/deepseek_v3.py delete mode 100644 src/llmcompressor/modeling/prepare.py delete mode 100644 src/llmcompressor/utils/module.py diff --git a/examples/quantizing_moe/deepseek_moe_w4a16.py b/examples/quantizing_moe/deepseek_moe_w4a16.py new file mode 100644 index 000000000..9880e9248 --- /dev/null +++ b/examples/quantizing_moe/deepseek_moe_w4a16.py @@ -0,0 +1,125 @@ +import torch +from datasets import load_dataset +from packaging.version import Version +from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ + +from llmcompressor import oneshot +from llmcompressor.utils import dispatch_for_generation + +# NOTE: transformers 4.49.0 has an attribute error with DeepSeek. +# Please consider either downgrading your transformers version to a +# previous version or upgrading to a version where this bug is fixed + +# select a Mixture of Experts model for quantization +MODEL_ID = "deepseek-ai/DeepSeek-V2.5" + +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +# Select calibration dataset. 
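+# As with the other MoE examples, using more calibration samples helps ensure that
+# every expert receives calibration data; 512 samples is a reasonable starting point.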
+DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" +NUM_CALIBRATION_SAMPLES = 512 +MAX_SEQUENCE_LENGTH = 2048 + + +# Load dataset and preprocess. +ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") +ds = ds.shuffle(seed=42) + + +def preprocess(example): + return { + "text": tokenizer.apply_chat_template( + example["messages"], + tokenize=False, + ) + } + + +ds = ds.map(preprocess) + + +# Tokenize inputs. +def tokenize(sample): + return tokenizer( + sample["text"], + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) + + +ds = ds.map(tokenize, remove_columns=ds.column_names) + +# define a llmcompressor recipe for W416 quantization +# since the MoE gate layers are sensitive to quantization, we add them to the ignore +# list so they remain at full precision +recipe = "deepseek_recipe_w4a16.yaml" + +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + save_compressed=True, + trust_remote_code_model=True, +) + +# Confirm generations of the quantized model look sane. +# Generation is broken for deepseek models when using the latest transformers package +if Version(__version__) < Version("4.48"): + print("========== SAMPLE GENERATION ==============") + dispatch_for_generation(model) + input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") + output = model.generate(input_ids, max_new_tokens=20) + print(tokenizer.decode(output[0])) + print("==========================================") +else: + print( + "WARNING: cannot perform sample generation of " + "deepseek models with transformers >= 4.48" + ) + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) + + +# Run the model on vLLM +try: + from vllm import LLM, SamplingParams + + vllm_installed = True +except ImportError: + vllm_installed = False + +if vllm_installed: + print("vLLM installed, running using vLLM") + sampling_params = SamplingParams(temperature=0.80, top_p=0.95) + llm = LLM( + model=SAVE_DIR, + tensor_parallel_size=2, + trust_remote_code=True, + max_model_len=1042, + dtype=torch.half, + ) + prompts = [ + "The capital of France is", + "The president of the US is", + "My name is", + ] + + outputs = llm.generate(prompts, sampling_params) + print("================= vLLM GENERATION ======================") + for output in outputs: + assert output + prompt = output.prompt + generated_text = output.outputs[0].text + print("PROMPT", prompt) + print("GENERATED TEXT", generated_text) diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py b/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py new file mode 100644 index 000000000..0bc9c24df --- /dev/null +++ b/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py @@ -0,0 +1,99 @@ +from datasets import load_dataset +from packaging.version import Version +from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ + +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import QuantizationModifier +from llmcompressor.utils import dispatch_for_generation + +# NOTE: transformers 4.49.0 has an attribute error with DeepSeek. 
+# Please consider either downgrading your transformers version to a +# previous version or upgrading to a version where this bug is fixed + +# select a Mixture of Experts model for quantization +MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" + +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, torch_dtype="auto", trust_remote_code=True +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +# Select calibration dataset. +# its recommended to use more calibration samples for MoE models so each expert is hit +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" +NUM_CALIBRATION_SAMPLES = 2048 +MAX_SEQUENCE_LENGTH = 2048 + + +# Load dataset and preprocess. +ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") +ds = ds.shuffle(seed=42) + + +def preprocess(example): + return { + "text": tokenizer.apply_chat_template( + example["messages"], + tokenize=False, + ) + } + + +ds = ds.map(preprocess) + + +# Tokenize inputs. +def tokenize(sample): + return tokenizer( + sample["text"], + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) + + +ds = ds.map(tokenize, remove_columns=ds.column_names) + +# define a llmcompressor recipe for FP8 W8A8 quantization +# since the MoE gate layers are sensitive to quantization, we add them to the ignore +# list so they remain at full precision +recipe = [ + QuantizationModifier( + targets="Linear", + scheme="FP8", + ignore=["lm_head", "re:.*mlp.gate$"], + ), +] + +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + trust_remote_code_model=True, +) + +# Confirm generations of the quantized model look sane. +# Generation is broken for deepseek models when using the latest transformers package +if Version(__version__) < Version("4.48"): + print("========== SAMPLE GENERATION ==============") + dispatch_for_generation(model) + SAMPLE_INPUT = ["I love quantization because"] + tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device) + output = model.generate(**inputs, max_length=50) + text_output = tokenizer.batch_decode(output) + print(text_output) +else: + print( + "WARNING: cannot perform sample generation of " + "deepseek models with transformers >= 4.48" + ) + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_int8.py b/examples/quantizing_moe/deepseek_moe_w8a8_int8.py new file mode 100644 index 000000000..3ec506c34 --- /dev/null +++ b/examples/quantizing_moe/deepseek_moe_w8a8_int8.py @@ -0,0 +1,101 @@ +import torch +from datasets import load_dataset +from packaging.version import Version +from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ + +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.utils import dispatch_for_generation + +# NOTE: transformers 4.49.0 has an attribute error with DeepSeek. 
+# Please consider either downgrading your transformers version to a +# previous version or upgrading to a version where this bug is fixed + +# select a Mixture of Experts model for quantization +MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" + +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +# Select calibration dataset. +# its recommended to use more calibration samples for MoE models so each expert is hit +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" +NUM_CALIBRATION_SAMPLES = 2048 +MAX_SEQUENCE_LENGTH = 2048 + + +# Load dataset and preprocess. +ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") +ds = ds.shuffle(seed=42) + + +def preprocess(example): + return { + "text": tokenizer.apply_chat_template( + example["messages"], + tokenize=False, + ) + } + + +ds = ds.map(preprocess) + + +# Tokenize inputs. +def tokenize(sample): + return tokenizer( + sample["text"], + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) + + +ds = ds.map(tokenize, remove_columns=ds.column_names) + +# define a llmcompressor recipe for INT8 W8A8 quantization +# since the MoE gate layers are sensitive to quantization, we add them to the ignore +# list so they remain at full precision +recipe = [ + GPTQModifier( + targets="Linear", + scheme="W8A8", + ignore=["lm_head", "re:.*mlp.gate$"], + ), +] + +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + trust_remote_code_model=True, +) + +# Confirm generations of the quantized model look sane. +# Generation is broken for deepseek models when using the latest transformers package +if Version(__version__) < Version("4.48"): + print("========== SAMPLE GENERATION ==============") + dispatch_for_generation(model) + SAMPLE_INPUT = ["I love quantization because"] + tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device) + output = model.generate(**inputs, max_length=50) + text_output = tokenizer.batch_decode(output) + print(text_output) + print("==========================================") +else: + print( + "WARNING: cannot perform sample generation of " + "deepseek models with transformers >= 4.48" + ) + +# Save to disk in compressed-tensors format. 
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W8A8" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/deepseek_recipe_w4a16.yaml b/examples/quantizing_moe/deepseek_recipe_w4a16.yaml new file mode 100644 index 000000000..23f276e2f --- /dev/null +++ b/examples/quantizing_moe/deepseek_recipe_w4a16.yaml @@ -0,0 +1,8 @@ +quant_stage: + quant_modifiers: + GPTQModifier: + ignore: [lm_head, "re:.*mlp.gate$"] + config_groups: + group_0: + weights: {num_bits: 4, type: int, symmetric: true, strategy: channel, dynamic: false} + targets: [Linear] diff --git a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py b/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py index 5021c7947..a17bf873d 100644 --- a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py +++ b/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py @@ -1,84 +1,56 @@ -import torch -from datasets import load_dataset -from transformers import AutoModelForCausalLM, AutoTokenizer +from typing import List + +from packaging.version import Version +from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier from llmcompressor.utils import dispatch_for_generation -# select a Mixture of Experts model for quantization MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1" -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True -) +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) -# Select calibration dataset. -DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_SPLIT = "train_sft" -NUM_CALIBRATION_SAMPLES = 512 -MAX_SEQUENCE_LENGTH = 2048 - - -# Load dataset and preprocess. -ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") -ds = ds.shuffle(seed=42) - - -def preprocess(example): - return { - "text": tokenizer.apply_chat_template( - example["messages"], - tokenize=False, - ) - } - - -ds = ds.map(preprocess) - - -# Tokenize inputs. -def tokenize(sample): - return tokenizer( - sample["text"], - padding=False, - max_length=MAX_SEQUENCE_LENGTH, - truncation=True, - add_special_tokens=False, - ) +# Dataset config parameters +DATASET_ID = "open_platypus" +DATASET_SPLIT = "train" +MAX_SEQ_LENGTH = 2048 +NUM_CALIBRATION_SAMPLES = 512 -ds = ds.map(tokenize, remove_columns=ds.column_names) +# Recipe +layers_to_ignore: List[str] = [ + "lm_head", + "re:.*block_sparse_moe.gate", # does not quantize well +] +recipe = QuantizationModifier(scheme="FP8", targets="Linear", ignore=layers_to_ignore) -# Configure the quantization algorithm to run. 
-# since the MoE gate layers are sensitive to quantization, we add them to the ignore -# list so they remain at full precision -recipe = QuantizationModifier( - scheme="FP8", - targets="Linear", - ignore=[ - "lm_head", - "re:.*block_sparse_moe.gate", # does not quantize well - ], -) oneshot( model=model, - dataset=ds, + tokenizer=tokenizer, + dataset=DATASET_ID, + splits={"calibration": f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]"}, recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, + max_seq_length=MAX_SEQ_LENGTH, num_calibration_samples=NUM_CALIBRATION_SAMPLES, - trust_remote_code_model=True, ) -print("========== SAMPLE GENERATION ==============") -dispatch_for_generation(model) -sample = tokenizer("Hello my name is", return_tensors="pt") -sample = {key: value.to("cuda") for key, value in sample.items()} -output = model.generate(**sample, max_new_tokens=100) -print(tokenizer.decode(output[0])) -print("==========================================") +# Confirm generations of the quantized model look sane. +# Generation is broken for deepseek models when using the latest transformers package +if Version(__version__) < Version("4.48"): + print("========== SAMPLE GENERATION ==============") + dispatch_for_generation(model) + input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") + output = model.generate(input_ids, max_new_tokens=20) + print(tokenizer.decode(output[0])) + print("==========================================") +else: + print( + "WARNING: cannot perform sample generation of " + "deepseek models with transformers >= 4.48" + ) # Save to disk in compressed-tensors format. SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8" diff --git a/examples/quantizing_moe/qwen_moe_w4a16.py b/examples/quantizing_moe/qwen_moe_w4a16.py index 2531e6528..40a78a9b7 100644 --- a/examples/quantizing_moe/qwen_moe_w4a16.py +++ b/examples/quantizing_moe/qwen_moe_w4a16.py @@ -73,13 +73,12 @@ def tokenize(sample): # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") dispatch_for_generation(model) -sample = tokenizer("Hello my name is", return_tensors="pt") -sample = {key: value.to("cuda") for key, value in sample.items()} -output = model.generate(**sample, max_new_tokens=100) +input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") +output = model.generate(input_ids, max_new_tokens=20) print(tokenizer.decode(output[0])) print("==========================================") # Save to disk in compressed-tensors format. 
-SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16-G128" +SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-quantized.w4a16" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) diff --git a/src/llmcompressor/modeling/__init__.py b/src/llmcompressor/modeling/__init__.py deleted file mode 100644 index e2c22ed1f..000000000 --- a/src/llmcompressor/modeling/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# flake8: noqa - -from .prepare import * diff --git a/src/llmcompressor/modeling/deepseek_v3.py b/src/llmcompressor/modeling/deepseek_v3.py deleted file mode 100644 index 4b885ff64..000000000 --- a/src/llmcompressor/modeling/deepseek_v3.py +++ /dev/null @@ -1,48 +0,0 @@ -import torch -from transformers.models.deepseek_v3.modeling_deepseek_v3 import DeepseekV3MoE - - -class DeepseekV3MoECalibrate(torch.nn.Module): - def __init__(self, config, experts, gate, shared_experts): - super().__init__() - self.config = config - self.experts = experts - self.gate = gate - self.shared_experts = shared_experts - - def forward(self, hidden_states): - residuals = hidden_states - orig_shape = hidden_states.shape - topk_indices, topk_weights = self.gate(hidden_states) - hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) - - # Begin MoE - final_hidden_states = torch.zeros_like(hidden_states, dtype=topk_weights.dtype) - expert_mask = torch.nn.functional.one_hot( - topk_indices, num_classes=len(self.experts) - ) - expert_mask = expert_mask.permute(2, 0, 1) - - for expert_idx in range(len(self.experts)): - expert = self.experts[expert_idx] - mask = expert_mask[expert_idx] - token_indices, weight_indices = torch.where(mask) - - expert_weights = topk_weights[token_indices, weight_indices] - expert_input = hidden_states[token_indices] - expert_output = expert(expert_input) - weighted_output = expert_output * expert_weights.unsqueeze(-1) - - if token_indices.numel() > 0: - final_hidden_states.index_add_(0, token_indices, weighted_output) - # End MoE - - hidden_states = final_hidden_states.type(hidden_states.dtype).view(*orig_shape) - hidden_states = hidden_states + self.shared_experts(residuals) - return hidden_states - - -def replace(module: DeepseekV3MoE) -> DeepseekV3MoECalibrate: - return DeepseekV3MoECalibrate( - module.config, module.experts, module.gate, module.shared_experts - ) diff --git a/src/llmcompressor/modeling/prepare.py b/src/llmcompressor/modeling/prepare.py deleted file mode 100644 index a8dedf8ee..000000000 --- a/src/llmcompressor/modeling/prepare.py +++ /dev/null @@ -1,22 +0,0 @@ -import torch -from transformers import PreTrainedModel -from transformers.models.deepseek_v3.modeling_deepseek_v3 import DeepseekV3MoE - -from llmcompressor.modeling.deepseek_v3 import replace as replace_DeepseekV3MoE -from llmcompressor.utils.module import module_bfs - -__all__ = ["prepare_for_quantization"] - -replacements = { - DeepseekV3MoE: replace_DeepseekV3MoE, -} - - -def prepare_for_quantization(model: PreTrainedModel) -> PreTrainedModel: - def replace(module: torch.nn.Module) -> torch.nn.Module: - if module.__class__ in replacements: - return replacements[module.__class__](module) - else: - return module - - return module_bfs(model, replace, progress=True) diff --git a/src/llmcompressor/utils/module.py b/src/llmcompressor/utils/module.py deleted file mode 100644 index a02aa8b4a..000000000 --- a/src/llmcompressor/utils/module.py +++ /dev/null @@ -1,27 +0,0 @@ -from typing import Callable, Union - -import torch -import tqdm - -__all__ = ["module_bfs"] - - -def 
module_bfs( - module: torch.nn.Module, - func: Callable[[torch.nn.Module], torch.nn.Module], - pre: bool = True, - progress: Union[bool, tqdm.tqdm] = False, -) -> torch.nn.Module: - if progress is True: - total = len(list(module.modules())) - progress = tqdm.tqdm(total=total) - if pre: - module = func(module) - for name, child in list(module.named_children()): - module.add_module(name, module_bfs(child, func, pre, progress)) - if not pre: - module = func(module) - if isinstance(progress, tqdm.tqdm): - progress.update(1) - - return module From e77a31bbf9b0c1a0a75613dc222ca456db8867df Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Fri, 20 Jun 2025 13:49:31 -0400 Subject: [PATCH 12/22] clean examples Signed-off-by: Kyle Sayers --- examples/quantizing_moe/deepseek_moe_w4a16.py | 125 ------------------ .../quantizing_moe/deepseek_moe_w8a8_fp8.py | 99 -------------- .../quantizing_moe/deepseek_moe_w8a8_int8.py | 101 -------------- .../quantizing_moe/deepseek_recipe_w4a16.yaml | 8 -- ...epseekv3_example.py => mixtral_example.py} | 46 +++---- .../quantizing_moe/mixtral_moe_w8a8_fp8.py | 58 -------- .../{qwen_moe_w4a16.py => qwen_example.py} | 7 +- tests/examples/test_quantizing_moe.py | 11 +- 8 files changed, 30 insertions(+), 425 deletions(-) delete mode 100644 examples/quantizing_moe/deepseek_moe_w4a16.py delete mode 100644 examples/quantizing_moe/deepseek_moe_w8a8_fp8.py delete mode 100644 examples/quantizing_moe/deepseek_moe_w8a8_int8.py delete mode 100644 examples/quantizing_moe/deepseek_recipe_w4a16.yaml rename examples/quantizing_moe/{deepseekv3_example.py => mixtral_example.py} (57%) delete mode 100644 examples/quantizing_moe/mixtral_moe_w8a8_fp8.py rename examples/quantizing_moe/{qwen_moe_w4a16.py => qwen_example.py} (90%) diff --git a/examples/quantizing_moe/deepseek_moe_w4a16.py b/examples/quantizing_moe/deepseek_moe_w4a16.py deleted file mode 100644 index 9880e9248..000000000 --- a/examples/quantizing_moe/deepseek_moe_w4a16.py +++ /dev/null @@ -1,125 +0,0 @@ -import torch -from datasets import load_dataset -from packaging.version import Version -from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ - -from llmcompressor import oneshot -from llmcompressor.utils import dispatch_for_generation - -# NOTE: transformers 4.49.0 has an attribute error with DeepSeek. -# Please consider either downgrading your transformers version to a -# previous version or upgrading to a version where this bug is fixed - -# select a Mixture of Experts model for quantization -MODEL_ID = "deepseek-ai/DeepSeek-V2.5" - -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -# Select calibration dataset. -DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_SPLIT = "train_sft" -NUM_CALIBRATION_SAMPLES = 512 -MAX_SEQUENCE_LENGTH = 2048 - - -# Load dataset and preprocess. -ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") -ds = ds.shuffle(seed=42) - - -def preprocess(example): - return { - "text": tokenizer.apply_chat_template( - example["messages"], - tokenize=False, - ) - } - - -ds = ds.map(preprocess) - - -# Tokenize inputs. 
-def tokenize(sample): - return tokenizer( - sample["text"], - padding=False, - max_length=MAX_SEQUENCE_LENGTH, - truncation=True, - add_special_tokens=False, - ) - - -ds = ds.map(tokenize, remove_columns=ds.column_names) - -# define a llmcompressor recipe for W416 quantization -# since the MoE gate layers are sensitive to quantization, we add them to the ignore -# list so they remain at full precision -recipe = "deepseek_recipe_w4a16.yaml" - -oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - save_compressed=True, - trust_remote_code_model=True, -) - -# Confirm generations of the quantized model look sane. -# Generation is broken for deepseek models when using the latest transformers package -if Version(__version__) < Version("4.48"): - print("========== SAMPLE GENERATION ==============") - dispatch_for_generation(model) - input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") - output = model.generate(input_ids, max_new_tokens=20) - print(tokenizer.decode(output[0])) - print("==========================================") -else: - print( - "WARNING: cannot perform sample generation of " - "deepseek models with transformers >= 4.48" - ) - -# Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) - - -# Run the model on vLLM -try: - from vllm import LLM, SamplingParams - - vllm_installed = True -except ImportError: - vllm_installed = False - -if vllm_installed: - print("vLLM installed, running using vLLM") - sampling_params = SamplingParams(temperature=0.80, top_p=0.95) - llm = LLM( - model=SAVE_DIR, - tensor_parallel_size=2, - trust_remote_code=True, - max_model_len=1042, - dtype=torch.half, - ) - prompts = [ - "The capital of France is", - "The president of the US is", - "My name is", - ] - - outputs = llm.generate(prompts, sampling_params) - print("================= vLLM GENERATION ======================") - for output in outputs: - assert output - prompt = output.prompt - generated_text = output.outputs[0].text - print("PROMPT", prompt) - print("GENERATED TEXT", generated_text) diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py b/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py deleted file mode 100644 index 0bc9c24df..000000000 --- a/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py +++ /dev/null @@ -1,99 +0,0 @@ -from datasets import load_dataset -from packaging.version import Version -from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ - -from llmcompressor import oneshot -from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.utils import dispatch_for_generation - -# NOTE: transformers 4.49.0 has an attribute error with DeepSeek. -# Please consider either downgrading your transformers version to a -# previous version or upgrading to a version where this bug is fixed - -# select a Mixture of Experts model for quantization -MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" - -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, torch_dtype="auto", trust_remote_code=True -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -# Select calibration dataset. 
-# its recommended to use more calibration samples for MoE models so each expert is hit -DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_SPLIT = "train_sft" -NUM_CALIBRATION_SAMPLES = 2048 -MAX_SEQUENCE_LENGTH = 2048 - - -# Load dataset and preprocess. -ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") -ds = ds.shuffle(seed=42) - - -def preprocess(example): - return { - "text": tokenizer.apply_chat_template( - example["messages"], - tokenize=False, - ) - } - - -ds = ds.map(preprocess) - - -# Tokenize inputs. -def tokenize(sample): - return tokenizer( - sample["text"], - padding=False, - max_length=MAX_SEQUENCE_LENGTH, - truncation=True, - add_special_tokens=False, - ) - - -ds = ds.map(tokenize, remove_columns=ds.column_names) - -# define a llmcompressor recipe for FP8 W8A8 quantization -# since the MoE gate layers are sensitive to quantization, we add them to the ignore -# list so they remain at full precision -recipe = [ - QuantizationModifier( - targets="Linear", - scheme="FP8", - ignore=["lm_head", "re:.*mlp.gate$"], - ), -] - -oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - trust_remote_code_model=True, -) - -# Confirm generations of the quantized model look sane. -# Generation is broken for deepseek models when using the latest transformers package -if Version(__version__) < Version("4.48"): - print("========== SAMPLE GENERATION ==============") - dispatch_for_generation(model) - SAMPLE_INPUT = ["I love quantization because"] - tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device) - output = model.generate(**inputs, max_length=50) - text_output = tokenizer.batch_decode(output) - print(text_output) -else: - print( - "WARNING: cannot perform sample generation of " - "deepseek models with transformers >= 4.48" - ) - -# Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_int8.py b/examples/quantizing_moe/deepseek_moe_w8a8_int8.py deleted file mode 100644 index 3ec506c34..000000000 --- a/examples/quantizing_moe/deepseek_moe_w8a8_int8.py +++ /dev/null @@ -1,101 +0,0 @@ -import torch -from datasets import load_dataset -from packaging.version import Version -from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ - -from llmcompressor import oneshot -from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.utils import dispatch_for_generation - -# NOTE: transformers 4.49.0 has an attribute error with DeepSeek. -# Please consider either downgrading your transformers version to a -# previous version or upgrading to a version where this bug is fixed - -# select a Mixture of Experts model for quantization -MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" - -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -# Select calibration dataset. -# its recommended to use more calibration samples for MoE models so each expert is hit -DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_SPLIT = "train_sft" -NUM_CALIBRATION_SAMPLES = 2048 -MAX_SEQUENCE_LENGTH = 2048 - - -# Load dataset and preprocess. 
-ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") -ds = ds.shuffle(seed=42) - - -def preprocess(example): - return { - "text": tokenizer.apply_chat_template( - example["messages"], - tokenize=False, - ) - } - - -ds = ds.map(preprocess) - - -# Tokenize inputs. -def tokenize(sample): - return tokenizer( - sample["text"], - padding=False, - max_length=MAX_SEQUENCE_LENGTH, - truncation=True, - add_special_tokens=False, - ) - - -ds = ds.map(tokenize, remove_columns=ds.column_names) - -# define a llmcompressor recipe for INT8 W8A8 quantization -# since the MoE gate layers are sensitive to quantization, we add them to the ignore -# list so they remain at full precision -recipe = [ - GPTQModifier( - targets="Linear", - scheme="W8A8", - ignore=["lm_head", "re:.*mlp.gate$"], - ), -] - -oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - trust_remote_code_model=True, -) - -# Confirm generations of the quantized model look sane. -# Generation is broken for deepseek models when using the latest transformers package -if Version(__version__) < Version("4.48"): - print("========== SAMPLE GENERATION ==============") - dispatch_for_generation(model) - SAMPLE_INPUT = ["I love quantization because"] - tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device) - output = model.generate(**inputs, max_length=50) - text_output = tokenizer.batch_decode(output) - print(text_output) - print("==========================================") -else: - print( - "WARNING: cannot perform sample generation of " - "deepseek models with transformers >= 4.48" - ) - -# Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W8A8" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/deepseek_recipe_w4a16.yaml b/examples/quantizing_moe/deepseek_recipe_w4a16.yaml deleted file mode 100644 index 23f276e2f..000000000 --- a/examples/quantizing_moe/deepseek_recipe_w4a16.yaml +++ /dev/null @@ -1,8 +0,0 @@ -quant_stage: - quant_modifiers: - GPTQModifier: - ignore: [lm_head, "re:.*mlp.gate$"] - config_groups: - group_0: - weights: {num_bits: 4, type: int, symmetric: true, strategy: channel, dynamic: false} - targets: [Linear] diff --git a/examples/quantizing_moe/deepseekv3_example.py b/examples/quantizing_moe/mixtral_example.py similarity index 57% rename from examples/quantizing_moe/deepseekv3_example.py rename to examples/quantizing_moe/mixtral_example.py index 1b4c334ff..49b08c722 100644 --- a/examples/quantizing_moe/deepseekv3_example.py +++ b/examples/quantizing_moe/mixtral_example.py @@ -1,28 +1,26 @@ +import torch from datasets import load_dataset from transformers import AutoModelForCausalLM, AutoTokenizer -from llmcompressor.modeling import prepare_for_quantization -from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.transformers import oneshot +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import QuantizationModifier from llmcompressor.utils import dispatch_for_generation -# Select model and load it. 
-# For DeepSeekv3, we require a full precision model in order to properly calibrate -# `DeepSeek-V3-BF16` is a DeepSeek-V3 FP8 model which has been converted to BF16 -model_id = "RedHatAI/DeepSeek-V3-BF16" -model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") -tokenizer = AutoTokenizer.from_pretrained(model_id) -model = prepare_for_quantization(model) +# select a Mixture of Experts model for quantization +MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1" + +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) # Select calibration dataset. DATASET_ID = "HuggingFaceH4/ultrachat_200k" DATASET_SPLIT = "train_sft" - -# Select number of samples. 512 samples is a good place to start. -# Increasing the number of samples can improve accuracy. NUM_CALIBRATION_SAMPLES = 512 MAX_SEQUENCE_LENGTH = 2048 + # Load dataset and preprocess. ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") ds = ds.shuffle(seed=42) @@ -56,33 +54,33 @@ def tokenize(sample): # Configure the quantization algorithm to run. # since the MoE gate layers are sensitive to quantization, we add them to the ignore # list so they remain at full precision -recipe = GPTQModifier( - targets="Linear", scheme="W4A16", ignore=["lm_head", "re:.*mlp.gate$"] +recipe = QuantizationModifier( + scheme="W4A16", + targets="Linear", + ignore=[ + "lm_head", + "re:.*block_sparse_moe.gate", # does not quantize well + ], ) -# Apply algorithms. -# due to the large size of DeepSeekV3, we specify sequential targets such that -# only one MLP is loaded into GPU memory at a time oneshot( model=model, dataset=ds, recipe=recipe, max_seq_length=MAX_SEQUENCE_LENGTH, num_calibration_samples=NUM_CALIBRATION_SAMPLES, - sequential_targets=["DeepseekV3Attention", "DeepseekV3MLP"], + trust_remote_code_model=True, ) -# Confirm generations of the quantized model look sane. -print("\n\n") print("========== SAMPLE GENERATION ==============") dispatch_for_generation(model) sample = tokenizer("Hello my name is", return_tensors="pt") sample = {key: value.to("cuda") for key, value in sample.items()} output = model.generate(**sample, max_new_tokens=100) print(tokenizer.decode(output[0])) -print("==========================================\n\n") +print("==========================================") -# Save to disk compressed. -SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128" +# Save to disk in compressed-tensors format. 
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16-G128" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py b/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py deleted file mode 100644 index a17bf873d..000000000 --- a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py +++ /dev/null @@ -1,58 +0,0 @@ -from typing import List - -from packaging.version import Version -from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ - -from llmcompressor import oneshot -from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.utils import dispatch_for_generation - -MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1" - -model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - - -# Dataset config parameters -DATASET_ID = "open_platypus" -DATASET_SPLIT = "train" -MAX_SEQ_LENGTH = 2048 -NUM_CALIBRATION_SAMPLES = 512 - -# Recipe -layers_to_ignore: List[str] = [ - "lm_head", - "re:.*block_sparse_moe.gate", # does not quantize well -] -recipe = QuantizationModifier(scheme="FP8", targets="Linear", ignore=layers_to_ignore) - - -oneshot( - model=model, - tokenizer=tokenizer, - dataset=DATASET_ID, - splits={"calibration": f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]"}, - recipe=recipe, - max_seq_length=MAX_SEQ_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, -) - -# Confirm generations of the quantized model look sane. -# Generation is broken for deepseek models when using the latest transformers package -if Version(__version__) < Version("4.48"): - print("========== SAMPLE GENERATION ==============") - dispatch_for_generation(model) - input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") - output = model.generate(input_ids, max_new_tokens=20) - print(tokenizer.decode(output[0])) - print("==========================================") -else: - print( - "WARNING: cannot perform sample generation of " - "deepseek models with transformers >= 4.48" - ) - -# Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/qwen_moe_w4a16.py b/examples/quantizing_moe/qwen_example.py similarity index 90% rename from examples/quantizing_moe/qwen_moe_w4a16.py rename to examples/quantizing_moe/qwen_example.py index 40a78a9b7..2531e6528 100644 --- a/examples/quantizing_moe/qwen_moe_w4a16.py +++ b/examples/quantizing_moe/qwen_example.py @@ -73,12 +73,13 @@ def tokenize(sample): # Confirm generations of the quantized model look sane. print("========== SAMPLE GENERATION ==============") dispatch_for_generation(model) -input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") -output = model.generate(input_ids, max_new_tokens=20) +sample = tokenizer("Hello my name is", return_tensors="pt") +sample = {key: value.to("cuda") for key, value in sample.items()} +output = model.generate(**sample, max_new_tokens=100) print(tokenizer.decode(output[0])) print("==========================================") # Save to disk in compressed-tensors format. 
-SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-quantized.w4a16" +SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16-G128" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) diff --git a/tests/examples/test_quantizing_moe.py b/tests/examples/test_quantizing_moe.py index 49686d25c..848d5e3bf 100644 --- a/tests/examples/test_quantizing_moe.py +++ b/tests/examples/test_quantizing_moe.py @@ -44,14 +44,11 @@ def test_doc_example_command(self, example_dir: str, tmp_path: Path): "script_filename", [ pytest.param( - "deepseek_moe_w4a16.py", - marks=[ - pytest.mark.multi_gpu, - pytest.mark.skip(reason="exceptionally long run time"), - ], + "deepseekv2_5_example.py", + marks=pytest.mark.skip(reason="exceptionally long run time"), ), - pytest.param("deepseek_moe_w8a8_fp8.py"), - pytest.param("deepseek_moe_w8a8_int8.py", marks=pytest.mark.multi_gpu), + pytest.param("mixtral_example.py"), + pytest.param("qwen_example.py"), ], ) def test_deepseek_example_script( From 366ac257c801d29427154200a9aa3cb4f6be5080 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Fri, 20 Jun 2025 13:51:36 -0400 Subject: [PATCH 13/22] revert examples changes Signed-off-by: Kyle Sayers --- examples/quantizing_moe/deepseek_moe_w4a16.py | 125 ++++++++++++++++++ .../quantizing_moe/deepseek_moe_w8a8_fp8.py | 99 ++++++++++++++ .../quantizing_moe/deepseek_moe_w8a8_int8.py | 101 ++++++++++++++ .../quantizing_moe/deepseek_recipe_w4a16.yaml | 8 ++ .../quantizing_moe/mixtral_moe_w8a8_fp8.py | 58 ++++++++ examples/quantizing_moe/qwen_moe_w4a16.py | 84 ++++++++++++ 6 files changed, 475 insertions(+) create mode 100644 examples/quantizing_moe/deepseek_moe_w4a16.py create mode 100644 examples/quantizing_moe/deepseek_moe_w8a8_fp8.py create mode 100644 examples/quantizing_moe/deepseek_moe_w8a8_int8.py create mode 100644 examples/quantizing_moe/deepseek_recipe_w4a16.yaml create mode 100644 examples/quantizing_moe/mixtral_moe_w8a8_fp8.py create mode 100644 examples/quantizing_moe/qwen_moe_w4a16.py diff --git a/examples/quantizing_moe/deepseek_moe_w4a16.py b/examples/quantizing_moe/deepseek_moe_w4a16.py new file mode 100644 index 000000000..9880e9248 --- /dev/null +++ b/examples/quantizing_moe/deepseek_moe_w4a16.py @@ -0,0 +1,125 @@ +import torch +from datasets import load_dataset +from packaging.version import Version +from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ + +from llmcompressor import oneshot +from llmcompressor.utils import dispatch_for_generation + +# NOTE: transformers 4.49.0 has an attribute error with DeepSeek. +# Please consider either downgrading your transformers version to a +# previous version or upgrading to a version where this bug is fixed + +# select a Mixture of Experts model for quantization +MODEL_ID = "deepseek-ai/DeepSeek-V2.5" + +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +# Select calibration dataset. +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" +NUM_CALIBRATION_SAMPLES = 512 +MAX_SEQUENCE_LENGTH = 2048 + + +# Load dataset and preprocess. +ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") +ds = ds.shuffle(seed=42) + + +def preprocess(example): + return { + "text": tokenizer.apply_chat_template( + example["messages"], + tokenize=False, + ) + } + + +ds = ds.map(preprocess) + + +# Tokenize inputs. 
+def tokenize(sample): + return tokenizer( + sample["text"], + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) + + +ds = ds.map(tokenize, remove_columns=ds.column_names) + +# define a llmcompressor recipe for W416 quantization +# since the MoE gate layers are sensitive to quantization, we add them to the ignore +# list so they remain at full precision +recipe = "deepseek_recipe_w4a16.yaml" + +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + save_compressed=True, + trust_remote_code_model=True, +) + +# Confirm generations of the quantized model look sane. +# Generation is broken for deepseek models when using the latest transformers package +if Version(__version__) < Version("4.48"): + print("========== SAMPLE GENERATION ==============") + dispatch_for_generation(model) + input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") + output = model.generate(input_ids, max_new_tokens=20) + print(tokenizer.decode(output[0])) + print("==========================================") +else: + print( + "WARNING: cannot perform sample generation of " + "deepseek models with transformers >= 4.48" + ) + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) + + +# Run the model on vLLM +try: + from vllm import LLM, SamplingParams + + vllm_installed = True +except ImportError: + vllm_installed = False + +if vllm_installed: + print("vLLM installed, running using vLLM") + sampling_params = SamplingParams(temperature=0.80, top_p=0.95) + llm = LLM( + model=SAVE_DIR, + tensor_parallel_size=2, + trust_remote_code=True, + max_model_len=1042, + dtype=torch.half, + ) + prompts = [ + "The capital of France is", + "The president of the US is", + "My name is", + ] + + outputs = llm.generate(prompts, sampling_params) + print("================= vLLM GENERATION ======================") + for output in outputs: + assert output + prompt = output.prompt + generated_text = output.outputs[0].text + print("PROMPT", prompt) + print("GENERATED TEXT", generated_text) diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py b/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py new file mode 100644 index 000000000..0bc9c24df --- /dev/null +++ b/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py @@ -0,0 +1,99 @@ +from datasets import load_dataset +from packaging.version import Version +from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ + +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import QuantizationModifier +from llmcompressor.utils import dispatch_for_generation + +# NOTE: transformers 4.49.0 has an attribute error with DeepSeek. +# Please consider either downgrading your transformers version to a +# previous version or upgrading to a version where this bug is fixed + +# select a Mixture of Experts model for quantization +MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" + +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, torch_dtype="auto", trust_remote_code=True +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +# Select calibration dataset. 
+# its recommended to use more calibration samples for MoE models so each expert is hit +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" +NUM_CALIBRATION_SAMPLES = 2048 +MAX_SEQUENCE_LENGTH = 2048 + + +# Load dataset and preprocess. +ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") +ds = ds.shuffle(seed=42) + + +def preprocess(example): + return { + "text": tokenizer.apply_chat_template( + example["messages"], + tokenize=False, + ) + } + + +ds = ds.map(preprocess) + + +# Tokenize inputs. +def tokenize(sample): + return tokenizer( + sample["text"], + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) + + +ds = ds.map(tokenize, remove_columns=ds.column_names) + +# define a llmcompressor recipe for FP8 W8A8 quantization +# since the MoE gate layers are sensitive to quantization, we add them to the ignore +# list so they remain at full precision +recipe = [ + QuantizationModifier( + targets="Linear", + scheme="FP8", + ignore=["lm_head", "re:.*mlp.gate$"], + ), +] + +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + trust_remote_code_model=True, +) + +# Confirm generations of the quantized model look sane. +# Generation is broken for deepseek models when using the latest transformers package +if Version(__version__) < Version("4.48"): + print("========== SAMPLE GENERATION ==============") + dispatch_for_generation(model) + SAMPLE_INPUT = ["I love quantization because"] + tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device) + output = model.generate(**inputs, max_length=50) + text_output = tokenizer.batch_decode(output) + print(text_output) +else: + print( + "WARNING: cannot perform sample generation of " + "deepseek models with transformers >= 4.48" + ) + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_int8.py b/examples/quantizing_moe/deepseek_moe_w8a8_int8.py new file mode 100644 index 000000000..3ec506c34 --- /dev/null +++ b/examples/quantizing_moe/deepseek_moe_w8a8_int8.py @@ -0,0 +1,101 @@ +import torch +from datasets import load_dataset +from packaging.version import Version +from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ + +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.utils import dispatch_for_generation + +# NOTE: transformers 4.49.0 has an attribute error with DeepSeek. +# Please consider either downgrading your transformers version to a +# previous version or upgrading to a version where this bug is fixed + +# select a Mixture of Experts model for quantization +MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" + +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +# Select calibration dataset. +# its recommended to use more calibration samples for MoE models so each expert is hit +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" +NUM_CALIBRATION_SAMPLES = 2048 +MAX_SEQUENCE_LENGTH = 2048 + + +# Load dataset and preprocess. 
+ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") +ds = ds.shuffle(seed=42) + + +def preprocess(example): + return { + "text": tokenizer.apply_chat_template( + example["messages"], + tokenize=False, + ) + } + + +ds = ds.map(preprocess) + + +# Tokenize inputs. +def tokenize(sample): + return tokenizer( + sample["text"], + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) + + +ds = ds.map(tokenize, remove_columns=ds.column_names) + +# define a llmcompressor recipe for INT8 W8A8 quantization +# since the MoE gate layers are sensitive to quantization, we add them to the ignore +# list so they remain at full precision +recipe = [ + GPTQModifier( + targets="Linear", + scheme="W8A8", + ignore=["lm_head", "re:.*mlp.gate$"], + ), +] + +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + trust_remote_code_model=True, +) + +# Confirm generations of the quantized model look sane. +# Generation is broken for deepseek models when using the latest transformers package +if Version(__version__) < Version("4.48"): + print("========== SAMPLE GENERATION ==============") + dispatch_for_generation(model) + SAMPLE_INPUT = ["I love quantization because"] + tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device) + output = model.generate(**inputs, max_length=50) + text_output = tokenizer.batch_decode(output) + print(text_output) + print("==========================================") +else: + print( + "WARNING: cannot perform sample generation of " + "deepseek models with transformers >= 4.48" + ) + +# Save to disk in compressed-tensors format. 
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W8A8" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/deepseek_recipe_w4a16.yaml b/examples/quantizing_moe/deepseek_recipe_w4a16.yaml new file mode 100644 index 000000000..23f276e2f --- /dev/null +++ b/examples/quantizing_moe/deepseek_recipe_w4a16.yaml @@ -0,0 +1,8 @@ +quant_stage: + quant_modifiers: + GPTQModifier: + ignore: [lm_head, "re:.*mlp.gate$"] + config_groups: + group_0: + weights: {num_bits: 4, type: int, symmetric: true, strategy: channel, dynamic: false} + targets: [Linear] diff --git a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py b/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py new file mode 100644 index 000000000..a17bf873d --- /dev/null +++ b/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py @@ -0,0 +1,58 @@ +from typing import List + +from packaging.version import Version +from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ + +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import QuantizationModifier +from llmcompressor.utils import dispatch_for_generation + +MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1" + +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + + +# Dataset config parameters +DATASET_ID = "open_platypus" +DATASET_SPLIT = "train" +MAX_SEQ_LENGTH = 2048 +NUM_CALIBRATION_SAMPLES = 512 + +# Recipe +layers_to_ignore: List[str] = [ + "lm_head", + "re:.*block_sparse_moe.gate", # does not quantize well +] +recipe = QuantizationModifier(scheme="FP8", targets="Linear", ignore=layers_to_ignore) + + +oneshot( + model=model, + tokenizer=tokenizer, + dataset=DATASET_ID, + splits={"calibration": f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]"}, + recipe=recipe, + max_seq_length=MAX_SEQ_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, +) + +# Confirm generations of the quantized model look sane. +# Generation is broken for deepseek models when using the latest transformers package +if Version(__version__) < Version("4.48"): + print("========== SAMPLE GENERATION ==============") + dispatch_for_generation(model) + input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") + output = model.generate(input_ids, max_new_tokens=20) + print(tokenizer.decode(output[0])) + print("==========================================") +else: + print( + "WARNING: cannot perform sample generation of " + "deepseek models with transformers >= 4.48" + ) + +# Save to disk in compressed-tensors format. 
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/qwen_moe_w4a16.py b/examples/quantizing_moe/qwen_moe_w4a16.py new file mode 100644 index 000000000..40a78a9b7 --- /dev/null +++ b/examples/quantizing_moe/qwen_moe_w4a16.py @@ -0,0 +1,84 @@ +import torch +from datasets import load_dataset +from transformers import AutoModelForCausalLM, AutoTokenizer + +from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.transformers import oneshot +from llmcompressor.utils import dispatch_for_generation + +# select a Mixture of Experts model for quantization +MODEL_ID = "Qwen/Qwen1.5-MoE-A2.7B-Chat" + +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +# Select calibration dataset. +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" +NUM_CALIBRATION_SAMPLES = 512 +MAX_SEQUENCE_LENGTH = 2048 + + +# Load dataset and preprocess. +ds = load_dataset(DATASET_ID, split=DATASET_SPLIT) +ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) + + +def preprocess(example): + return { + "text": tokenizer.apply_chat_template( + example["messages"], + tokenize=False, + ) + } + + +ds = ds.map(preprocess) + + +# Tokenize inputs. +def tokenize(sample): + return tokenizer( + sample["text"], + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) + + +ds = ds.map(tokenize, remove_columns=ds.column_names) + +# define a llmcompressor recipe for W416 quantization with a group size of 128 +# since the MoE gate layers are sensitive to quantization, we add them to the ignore +# list so they remain at full precision +recipe = GPTQModifier( + targets="Linear", + scheme="W4A16", + ignore=["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"], +) + +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + save_compressed=True, + trust_remote_code_model=True, +) + +# Confirm generations of the quantized model look sane. +print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) +input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") +output = model.generate(input_ids, max_new_tokens=20) +print(tokenizer.decode(output[0])) +print("==========================================") + +# Save to disk in compressed-tensors format. 
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-quantized.w4a16" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) From c44da345e8dd0d63d5cc0df3cb06bad52f968fa6 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Fri, 20 Jun 2025 13:52:17 -0400 Subject: [PATCH 14/22] revert extra examples Signed-off-by: Kyle Sayers --- .../quantizing_moe/deepseekv2_5_example.py | 94 ------------------- examples/quantizing_moe/mixtral_example.py | 86 ----------------- examples/quantizing_moe/qwen_example.py | 85 ----------------- 3 files changed, 265 deletions(-) delete mode 100644 examples/quantizing_moe/deepseekv2_5_example.py delete mode 100644 examples/quantizing_moe/mixtral_example.py delete mode 100644 examples/quantizing_moe/qwen_example.py diff --git a/examples/quantizing_moe/deepseekv2_5_example.py b/examples/quantizing_moe/deepseekv2_5_example.py deleted file mode 100644 index c2b3b0305..000000000 --- a/examples/quantizing_moe/deepseekv2_5_example.py +++ /dev/null @@ -1,94 +0,0 @@ -import torch -from datasets import load_dataset -from packaging.version import Version -from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ - -from llmcompressor import oneshot -from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.utils import dispatch_for_generation - -# NOTE: transformers 4.49.0 has an attribute error with DeepSeek. -# Please consider either downgrading your transformers version to a -# previous version or upgrading to a version where this bug is fixed - -# select a Mixture of Experts model for quantization -MODEL_ID = "deepseek-ai/DeepSeek-V2.5" - -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -# Select calibration dataset. -DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_SPLIT = "train_sft" -NUM_CALIBRATION_SAMPLES = 512 -MAX_SEQUENCE_LENGTH = 2048 - - -# Load dataset and preprocess. -ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") -ds = ds.shuffle(seed=42) - - -def preprocess(example): - return { - "text": tokenizer.apply_chat_template( - example["messages"], - tokenize=False, - ) - } - - -ds = ds.map(preprocess) - - -# Tokenize inputs. -def tokenize(sample): - return tokenizer( - sample["text"], - padding=False, - max_length=MAX_SEQUENCE_LENGTH, - truncation=True, - add_special_tokens=False, - ) - - -ds = ds.map(tokenize, remove_columns=ds.column_names) - -# Configure the quantization algorithm to run. -# since the MoE gate layers are sensitive to quantization, we add them to the ignore -# list so they remain at full precision -recipe = GPTQModifier( - targets="Linear", scheme="W4A16", ignore=["lm_head", "re:.*mlp.gate$"] -) - -oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - trust_remote_code_model=True, -) - -# Confirm generations of the quantized model look sane. 
-# Generation is broken for deepseek models when using the latest transformers package -if Version(__version__) < Version("4.48"): - print("========== SAMPLE GENERATION ==============") - dispatch_for_generation(model) - sample = tokenizer("Hello my name is", return_tensors="pt") - sample = {key: value.to("cuda") for key, value in sample.items()} - output = model.generate(**sample, max_new_tokens=100) - print(tokenizer.decode(output[0])) - print("==========================================") -else: - print( - "WARNING: cannot perform sample generation of " - "deepseek models with transformers >= 4.48" - ) - -# Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/mixtral_example.py b/examples/quantizing_moe/mixtral_example.py deleted file mode 100644 index 49b08c722..000000000 --- a/examples/quantizing_moe/mixtral_example.py +++ /dev/null @@ -1,86 +0,0 @@ -import torch -from datasets import load_dataset -from transformers import AutoModelForCausalLM, AutoTokenizer - -from llmcompressor import oneshot -from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.utils import dispatch_for_generation - -# select a Mixture of Experts model for quantization -MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1" - -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -# Select calibration dataset. -DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_SPLIT = "train_sft" -NUM_CALIBRATION_SAMPLES = 512 -MAX_SEQUENCE_LENGTH = 2048 - - -# Load dataset and preprocess. -ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") -ds = ds.shuffle(seed=42) - - -def preprocess(example): - return { - "text": tokenizer.apply_chat_template( - example["messages"], - tokenize=False, - ) - } - - -ds = ds.map(preprocess) - - -# Tokenize inputs. -def tokenize(sample): - return tokenizer( - sample["text"], - padding=False, - max_length=MAX_SEQUENCE_LENGTH, - truncation=True, - add_special_tokens=False, - ) - - -ds = ds.map(tokenize, remove_columns=ds.column_names) - -# Configure the quantization algorithm to run. -# since the MoE gate layers are sensitive to quantization, we add them to the ignore -# list so they remain at full precision -recipe = QuantizationModifier( - scheme="W4A16", - targets="Linear", - ignore=[ - "lm_head", - "re:.*block_sparse_moe.gate", # does not quantize well - ], -) - -oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - trust_remote_code_model=True, -) - -print("========== SAMPLE GENERATION ==============") -dispatch_for_generation(model) -sample = tokenizer("Hello my name is", return_tensors="pt") -sample = {key: value.to("cuda") for key, value in sample.items()} -output = model.generate(**sample, max_new_tokens=100) -print(tokenizer.decode(output[0])) -print("==========================================") - -# Save to disk in compressed-tensors format. 
-SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/qwen_example.py b/examples/quantizing_moe/qwen_example.py deleted file mode 100644 index 2531e6528..000000000 --- a/examples/quantizing_moe/qwen_example.py +++ /dev/null @@ -1,85 +0,0 @@ -import torch -from datasets import load_dataset -from transformers import AutoModelForCausalLM, AutoTokenizer - -from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.transformers import oneshot -from llmcompressor.utils import dispatch_for_generation - -# select a Mixture of Experts model for quantization -MODEL_ID = "Qwen/Qwen1.5-MoE-A2.7B-Chat" - -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -# Select calibration dataset. -DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_SPLIT = "train_sft" -NUM_CALIBRATION_SAMPLES = 512 -MAX_SEQUENCE_LENGTH = 2048 - - -# Load dataset and preprocess. -ds = load_dataset(DATASET_ID, split=DATASET_SPLIT) -ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) - - -def preprocess(example): - return { - "text": tokenizer.apply_chat_template( - example["messages"], - tokenize=False, - ) - } - - -ds = ds.map(preprocess) - - -# Tokenize inputs. -def tokenize(sample): - return tokenizer( - sample["text"], - padding=False, - max_length=MAX_SEQUENCE_LENGTH, - truncation=True, - add_special_tokens=False, - ) - - -ds = ds.map(tokenize, remove_columns=ds.column_names) - -# define a llmcompressor recipe for W416 quantization with a group size of 128 -# since the MoE gate layers are sensitive to quantization, we add them to the ignore -# list so they remain at full precision -recipe = GPTQModifier( - targets="Linear", - scheme="W4A16", - ignore=["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"], -) - -oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - save_compressed=True, - trust_remote_code_model=True, -) - -# Confirm generations of the quantized model look sane. -print("========== SAMPLE GENERATION ==============") -dispatch_for_generation(model) -sample = tokenizer("Hello my name is", return_tensors="pt") -sample = {key: value.to("cuda") for key, value in sample.items()} -output = model.generate(**sample, max_new_tokens=100) -print(tokenizer.decode(output[0])) -print("==========================================") - -# Save to disk in compressed-tensors format. 
-SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) From 2db2789532215d744db24e3a9a31e7beabb0b2f0 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Fri, 20 Jun 2025 13:54:24 -0400 Subject: [PATCH 15/22] revert examples changes Signed-off-by: Kyle Sayers --- examples/quantizing_moe/deepseek_moe_w4a16.py | 125 ++++++++++++++++++ .../quantizing_moe/deepseek_moe_w8a8_fp8.py | 99 ++++++++++++++ .../quantizing_moe/deepseek_moe_w8a8_int8.py | 101 ++++++++++++++ .../quantizing_moe/deepseek_recipe_w4a16.yaml | 8 ++ .../quantizing_moe/mixtral_moe_w8a8_fp8.py | 58 ++++++++ examples/quantizing_moe/qwen_moe_w4a16.py | 84 ++++++++++++ tests/examples/test_quantizing_moe.py | 11 +- 7 files changed, 482 insertions(+), 4 deletions(-) create mode 100644 examples/quantizing_moe/deepseek_moe_w4a16.py create mode 100644 examples/quantizing_moe/deepseek_moe_w8a8_fp8.py create mode 100644 examples/quantizing_moe/deepseek_moe_w8a8_int8.py create mode 100644 examples/quantizing_moe/deepseek_recipe_w4a16.yaml create mode 100644 examples/quantizing_moe/mixtral_moe_w8a8_fp8.py create mode 100644 examples/quantizing_moe/qwen_moe_w4a16.py diff --git a/examples/quantizing_moe/deepseek_moe_w4a16.py b/examples/quantizing_moe/deepseek_moe_w4a16.py new file mode 100644 index 000000000..9880e9248 --- /dev/null +++ b/examples/quantizing_moe/deepseek_moe_w4a16.py @@ -0,0 +1,125 @@ +import torch +from datasets import load_dataset +from packaging.version import Version +from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ + +from llmcompressor import oneshot +from llmcompressor.utils import dispatch_for_generation + +# NOTE: transformers 4.49.0 has an attribute error with DeepSeek. +# Please consider either downgrading your transformers version to a +# previous version or upgrading to a version where this bug is fixed + +# select a Mixture of Experts model for quantization +MODEL_ID = "deepseek-ai/DeepSeek-V2.5" + +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +# Select calibration dataset. +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" +NUM_CALIBRATION_SAMPLES = 512 +MAX_SEQUENCE_LENGTH = 2048 + + +# Load dataset and preprocess. +ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") +ds = ds.shuffle(seed=42) + + +def preprocess(example): + return { + "text": tokenizer.apply_chat_template( + example["messages"], + tokenize=False, + ) + } + + +ds = ds.map(preprocess) + + +# Tokenize inputs. +def tokenize(sample): + return tokenizer( + sample["text"], + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) + + +ds = ds.map(tokenize, remove_columns=ds.column_names) + +# define a llmcompressor recipe for W416 quantization +# since the MoE gate layers are sensitive to quantization, we add them to the ignore +# list so they remain at full precision +recipe = "deepseek_recipe_w4a16.yaml" + +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + save_compressed=True, + trust_remote_code_model=True, +) + +# Confirm generations of the quantized model look sane. 
+# Generation is broken for deepseek models when using the latest transformers package +if Version(__version__) < Version("4.48"): + print("========== SAMPLE GENERATION ==============") + dispatch_for_generation(model) + input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") + output = model.generate(input_ids, max_new_tokens=20) + print(tokenizer.decode(output[0])) + print("==========================================") +else: + print( + "WARNING: cannot perform sample generation of " + "deepseek models with transformers >= 4.48" + ) + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) + + +# Run the model on vLLM +try: + from vllm import LLM, SamplingParams + + vllm_installed = True +except ImportError: + vllm_installed = False + +if vllm_installed: + print("vLLM installed, running using vLLM") + sampling_params = SamplingParams(temperature=0.80, top_p=0.95) + llm = LLM( + model=SAVE_DIR, + tensor_parallel_size=2, + trust_remote_code=True, + max_model_len=1042, + dtype=torch.half, + ) + prompts = [ + "The capital of France is", + "The president of the US is", + "My name is", + ] + + outputs = llm.generate(prompts, sampling_params) + print("================= vLLM GENERATION ======================") + for output in outputs: + assert output + prompt = output.prompt + generated_text = output.outputs[0].text + print("PROMPT", prompt) + print("GENERATED TEXT", generated_text) diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py b/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py new file mode 100644 index 000000000..0bc9c24df --- /dev/null +++ b/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py @@ -0,0 +1,99 @@ +from datasets import load_dataset +from packaging.version import Version +from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ + +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import QuantizationModifier +from llmcompressor.utils import dispatch_for_generation + +# NOTE: transformers 4.49.0 has an attribute error with DeepSeek. +# Please consider either downgrading your transformers version to a +# previous version or upgrading to a version where this bug is fixed + +# select a Mixture of Experts model for quantization +MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" + +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, torch_dtype="auto", trust_remote_code=True +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +# Select calibration dataset. +# its recommended to use more calibration samples for MoE models so each expert is hit +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" +NUM_CALIBRATION_SAMPLES = 2048 +MAX_SEQUENCE_LENGTH = 2048 + + +# Load dataset and preprocess. +ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") +ds = ds.shuffle(seed=42) + + +def preprocess(example): + return { + "text": tokenizer.apply_chat_template( + example["messages"], + tokenize=False, + ) + } + + +ds = ds.map(preprocess) + + +# Tokenize inputs. 
+def tokenize(sample): + return tokenizer( + sample["text"], + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) + + +ds = ds.map(tokenize, remove_columns=ds.column_names) + +# define a llmcompressor recipe for FP8 W8A8 quantization +# since the MoE gate layers are sensitive to quantization, we add them to the ignore +# list so they remain at full precision +recipe = [ + QuantizationModifier( + targets="Linear", + scheme="FP8", + ignore=["lm_head", "re:.*mlp.gate$"], + ), +] + +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + trust_remote_code_model=True, +) + +# Confirm generations of the quantized model look sane. +# Generation is broken for deepseek models when using the latest transformers package +if Version(__version__) < Version("4.48"): + print("========== SAMPLE GENERATION ==============") + dispatch_for_generation(model) + SAMPLE_INPUT = ["I love quantization because"] + tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device) + output = model.generate(**inputs, max_length=50) + text_output = tokenizer.batch_decode(output) + print(text_output) +else: + print( + "WARNING: cannot perform sample generation of " + "deepseek models with transformers >= 4.48" + ) + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_int8.py b/examples/quantizing_moe/deepseek_moe_w8a8_int8.py new file mode 100644 index 000000000..3ec506c34 --- /dev/null +++ b/examples/quantizing_moe/deepseek_moe_w8a8_int8.py @@ -0,0 +1,101 @@ +import torch +from datasets import load_dataset +from packaging.version import Version +from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ + +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.utils import dispatch_for_generation + +# NOTE: transformers 4.49.0 has an attribute error with DeepSeek. +# Please consider either downgrading your transformers version to a +# previous version or upgrading to a version where this bug is fixed + +# select a Mixture of Experts model for quantization +MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" + +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +# Select calibration dataset. +# its recommended to use more calibration samples for MoE models so each expert is hit +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" +NUM_CALIBRATION_SAMPLES = 2048 +MAX_SEQUENCE_LENGTH = 2048 + + +# Load dataset and preprocess. +ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") +ds = ds.shuffle(seed=42) + + +def preprocess(example): + return { + "text": tokenizer.apply_chat_template( + example["messages"], + tokenize=False, + ) + } + + +ds = ds.map(preprocess) + + +# Tokenize inputs. 
+def tokenize(sample): + return tokenizer( + sample["text"], + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) + + +ds = ds.map(tokenize, remove_columns=ds.column_names) + +# define a llmcompressor recipe for INT8 W8A8 quantization +# since the MoE gate layers are sensitive to quantization, we add them to the ignore +# list so they remain at full precision +recipe = [ + GPTQModifier( + targets="Linear", + scheme="W8A8", + ignore=["lm_head", "re:.*mlp.gate$"], + ), +] + +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + trust_remote_code_model=True, +) + +# Confirm generations of the quantized model look sane. +# Generation is broken for deepseek models when using the latest transformers package +if Version(__version__) < Version("4.48"): + print("========== SAMPLE GENERATION ==============") + dispatch_for_generation(model) + SAMPLE_INPUT = ["I love quantization because"] + tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device) + output = model.generate(**inputs, max_length=50) + text_output = tokenizer.batch_decode(output) + print(text_output) + print("==========================================") +else: + print( + "WARNING: cannot perform sample generation of " + "deepseek models with transformers >= 4.48" + ) + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W8A8" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/deepseek_recipe_w4a16.yaml b/examples/quantizing_moe/deepseek_recipe_w4a16.yaml new file mode 100644 index 000000000..23f276e2f --- /dev/null +++ b/examples/quantizing_moe/deepseek_recipe_w4a16.yaml @@ -0,0 +1,8 @@ +quant_stage: + quant_modifiers: + GPTQModifier: + ignore: [lm_head, "re:.*mlp.gate$"] + config_groups: + group_0: + weights: {num_bits: 4, type: int, symmetric: true, strategy: channel, dynamic: false} + targets: [Linear] diff --git a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py b/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py new file mode 100644 index 000000000..a17bf873d --- /dev/null +++ b/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py @@ -0,0 +1,58 @@ +from typing import List + +from packaging.version import Version +from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ + +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import QuantizationModifier +from llmcompressor.utils import dispatch_for_generation + +MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1" + +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + + +# Dataset config parameters +DATASET_ID = "open_platypus" +DATASET_SPLIT = "train" +MAX_SEQ_LENGTH = 2048 +NUM_CALIBRATION_SAMPLES = 512 + +# Recipe +layers_to_ignore: List[str] = [ + "lm_head", + "re:.*block_sparse_moe.gate", # does not quantize well +] +recipe = QuantizationModifier(scheme="FP8", targets="Linear", ignore=layers_to_ignore) + + +oneshot( + model=model, + tokenizer=tokenizer, + dataset=DATASET_ID, + splits={"calibration": f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]"}, + recipe=recipe, + max_seq_length=MAX_SEQ_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, +) + +# Confirm generations of the quantized model look sane. 
+# Generation is broken for deepseek models when using the latest transformers package +if Version(__version__) < Version("4.48"): + print("========== SAMPLE GENERATION ==============") + dispatch_for_generation(model) + input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") + output = model.generate(input_ids, max_new_tokens=20) + print(tokenizer.decode(output[0])) + print("==========================================") +else: + print( + "WARNING: cannot perform sample generation of " + "deepseek models with transformers >= 4.48" + ) + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/qwen_moe_w4a16.py b/examples/quantizing_moe/qwen_moe_w4a16.py new file mode 100644 index 000000000..40a78a9b7 --- /dev/null +++ b/examples/quantizing_moe/qwen_moe_w4a16.py @@ -0,0 +1,84 @@ +import torch +from datasets import load_dataset +from transformers import AutoModelForCausalLM, AutoTokenizer + +from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.transformers import oneshot +from llmcompressor.utils import dispatch_for_generation + +# select a Mixture of Experts model for quantization +MODEL_ID = "Qwen/Qwen1.5-MoE-A2.7B-Chat" + +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +# Select calibration dataset. +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" +NUM_CALIBRATION_SAMPLES = 512 +MAX_SEQUENCE_LENGTH = 2048 + + +# Load dataset and preprocess. +ds = load_dataset(DATASET_ID, split=DATASET_SPLIT) +ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) + + +def preprocess(example): + return { + "text": tokenizer.apply_chat_template( + example["messages"], + tokenize=False, + ) + } + + +ds = ds.map(preprocess) + + +# Tokenize inputs. +def tokenize(sample): + return tokenizer( + sample["text"], + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) + + +ds = ds.map(tokenize, remove_columns=ds.column_names) + +# define a llmcompressor recipe for W416 quantization with a group size of 128 +# since the MoE gate layers are sensitive to quantization, we add them to the ignore +# list so they remain at full precision +recipe = GPTQModifier( + targets="Linear", + scheme="W4A16", + ignore=["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"], +) + +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + save_compressed=True, + trust_remote_code_model=True, +) + +# Confirm generations of the quantized model look sane. +print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) +input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") +output = model.generate(input_ids, max_new_tokens=20) +print(tokenizer.decode(output[0])) +print("==========================================") + +# Save to disk in compressed-tensors format. 
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-quantized.w4a16" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/tests/examples/test_quantizing_moe.py b/tests/examples/test_quantizing_moe.py index 50e86c2c8..49686d25c 100644 --- a/tests/examples/test_quantizing_moe.py +++ b/tests/examples/test_quantizing_moe.py @@ -44,11 +44,14 @@ def test_doc_example_command(self, example_dir: str, tmp_path: Path): "script_filename", [ pytest.param( - "deepseekv3_example.py", - marks=pytest.mark.skip(reason="exceptionally long run time"), + "deepseek_moe_w4a16.py", + marks=[ + pytest.mark.multi_gpu, + pytest.mark.skip(reason="exceptionally long run time"), + ], ), - pytest.param("mixtral_example.py"), - pytest.param("qwen_example.py"), + pytest.param("deepseek_moe_w8a8_fp8.py"), + pytest.param("deepseek_moe_w8a8_int8.py", marks=pytest.mark.multi_gpu), ], ) def test_deepseek_example_script( From 0dc2381dd12303ae1c71127827e1efd1bc56de63 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Fri, 20 Jun 2025 13:55:09 -0400 Subject: [PATCH 16/22] remove extra examples Signed-off-by: Kyle Sayers --- examples/quantizing_moe/mixtral_example.py | 86 ---------------------- examples/quantizing_moe/qwen_example.py | 85 --------------------- 2 files changed, 171 deletions(-) delete mode 100644 examples/quantizing_moe/mixtral_example.py delete mode 100644 examples/quantizing_moe/qwen_example.py diff --git a/examples/quantizing_moe/mixtral_example.py b/examples/quantizing_moe/mixtral_example.py deleted file mode 100644 index 49b08c722..000000000 --- a/examples/quantizing_moe/mixtral_example.py +++ /dev/null @@ -1,86 +0,0 @@ -import torch -from datasets import load_dataset -from transformers import AutoModelForCausalLM, AutoTokenizer - -from llmcompressor import oneshot -from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.utils import dispatch_for_generation - -# select a Mixture of Experts model for quantization -MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1" - -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -# Select calibration dataset. -DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_SPLIT = "train_sft" -NUM_CALIBRATION_SAMPLES = 512 -MAX_SEQUENCE_LENGTH = 2048 - - -# Load dataset and preprocess. -ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") -ds = ds.shuffle(seed=42) - - -def preprocess(example): - return { - "text": tokenizer.apply_chat_template( - example["messages"], - tokenize=False, - ) - } - - -ds = ds.map(preprocess) - - -# Tokenize inputs. -def tokenize(sample): - return tokenizer( - sample["text"], - padding=False, - max_length=MAX_SEQUENCE_LENGTH, - truncation=True, - add_special_tokens=False, - ) - - -ds = ds.map(tokenize, remove_columns=ds.column_names) - -# Configure the quantization algorithm to run. 
-# since the MoE gate layers are sensitive to quantization, we add them to the ignore -# list so they remain at full precision -recipe = QuantizationModifier( - scheme="W4A16", - targets="Linear", - ignore=[ - "lm_head", - "re:.*block_sparse_moe.gate", # does not quantize well - ], -) - -oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - trust_remote_code_model=True, -) - -print("========== SAMPLE GENERATION ==============") -dispatch_for_generation(model) -sample = tokenizer("Hello my name is", return_tensors="pt") -sample = {key: value.to("cuda") for key, value in sample.items()} -output = model.generate(**sample, max_new_tokens=100) -print(tokenizer.decode(output[0])) -print("==========================================") - -# Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/qwen_example.py b/examples/quantizing_moe/qwen_example.py deleted file mode 100644 index 2531e6528..000000000 --- a/examples/quantizing_moe/qwen_example.py +++ /dev/null @@ -1,85 +0,0 @@ -import torch -from datasets import load_dataset -from transformers import AutoModelForCausalLM, AutoTokenizer - -from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.transformers import oneshot -from llmcompressor.utils import dispatch_for_generation - -# select a Mixture of Experts model for quantization -MODEL_ID = "Qwen/Qwen1.5-MoE-A2.7B-Chat" - -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -# Select calibration dataset. -DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_SPLIT = "train_sft" -NUM_CALIBRATION_SAMPLES = 512 -MAX_SEQUENCE_LENGTH = 2048 - - -# Load dataset and preprocess. -ds = load_dataset(DATASET_ID, split=DATASET_SPLIT) -ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) - - -def preprocess(example): - return { - "text": tokenizer.apply_chat_template( - example["messages"], - tokenize=False, - ) - } - - -ds = ds.map(preprocess) - - -# Tokenize inputs. -def tokenize(sample): - return tokenizer( - sample["text"], - padding=False, - max_length=MAX_SEQUENCE_LENGTH, - truncation=True, - add_special_tokens=False, - ) - - -ds = ds.map(tokenize, remove_columns=ds.column_names) - -# define a llmcompressor recipe for W416 quantization with a group size of 128 -# since the MoE gate layers are sensitive to quantization, we add them to the ignore -# list so they remain at full precision -recipe = GPTQModifier( - targets="Linear", - scheme="W4A16", - ignore=["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"], -) - -oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - save_compressed=True, - trust_remote_code_model=True, -) - -# Confirm generations of the quantized model look sane. -print("========== SAMPLE GENERATION ==============") -dispatch_for_generation(model) -sample = tokenizer("Hello my name is", return_tensors="pt") -sample = {key: value.to("cuda") for key, value in sample.items()} -output = model.generate(**sample, max_new_tokens=100) -print(tokenizer.decode(output[0])) -print("==========================================") - -# Save to disk in compressed-tensors format. 
-SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16-G128" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) From b70aba7e240fc6c626146c6d4bbdf7a46f0d3979 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Fri, 20 Jun 2025 13:55:39 -0400 Subject: [PATCH 17/22] revert examples tests changes Signed-off-by: Kyle Sayers --- tests/examples/test_quantizing_moe.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/examples/test_quantizing_moe.py b/tests/examples/test_quantizing_moe.py index 848d5e3bf..49686d25c 100644 --- a/tests/examples/test_quantizing_moe.py +++ b/tests/examples/test_quantizing_moe.py @@ -44,11 +44,14 @@ def test_doc_example_command(self, example_dir: str, tmp_path: Path): "script_filename", [ pytest.param( - "deepseekv2_5_example.py", - marks=pytest.mark.skip(reason="exceptionally long run time"), + "deepseek_moe_w4a16.py", + marks=[ + pytest.mark.multi_gpu, + pytest.mark.skip(reason="exceptionally long run time"), + ], ), - pytest.param("mixtral_example.py"), - pytest.param("qwen_example.py"), + pytest.param("deepseek_moe_w8a8_fp8.py"), + pytest.param("deepseek_moe_w8a8_int8.py", marks=pytest.mark.multi_gpu), ], ) def test_deepseek_example_script( From 5e5657be1babff32db1a5a1eb13389c72c1eca50 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Fri, 20 Jun 2025 13:58:01 -0400 Subject: [PATCH 18/22] Revert "revert extra examples" This reverts commit c44da345e8dd0d63d5cc0df3cb06bad52f968fa6. --- .../quantizing_moe/deepseekv2_5_example.py | 94 +++++++++++++++++++ examples/quantizing_moe/mixtral_example.py | 86 +++++++++++++++++ examples/quantizing_moe/qwen_example.py | 85 +++++++++++++++++ 3 files changed, 265 insertions(+) create mode 100644 examples/quantizing_moe/deepseekv2_5_example.py create mode 100644 examples/quantizing_moe/mixtral_example.py create mode 100644 examples/quantizing_moe/qwen_example.py diff --git a/examples/quantizing_moe/deepseekv2_5_example.py b/examples/quantizing_moe/deepseekv2_5_example.py new file mode 100644 index 000000000..c2b3b0305 --- /dev/null +++ b/examples/quantizing_moe/deepseekv2_5_example.py @@ -0,0 +1,94 @@ +import torch +from datasets import load_dataset +from packaging.version import Version +from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ + +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.utils import dispatch_for_generation + +# NOTE: transformers 4.49.0 has an attribute error with DeepSeek. +# Please consider either downgrading your transformers version to a +# previous version or upgrading to a version where this bug is fixed + +# select a Mixture of Experts model for quantization +MODEL_ID = "deepseek-ai/DeepSeek-V2.5" + +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +# Select calibration dataset. +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" +NUM_CALIBRATION_SAMPLES = 512 +MAX_SEQUENCE_LENGTH = 2048 + + +# Load dataset and preprocess. +ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") +ds = ds.shuffle(seed=42) + + +def preprocess(example): + return { + "text": tokenizer.apply_chat_template( + example["messages"], + tokenize=False, + ) + } + + +ds = ds.map(preprocess) + + +# Tokenize inputs. 
+def tokenize(sample): + return tokenizer( + sample["text"], + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) + + +ds = ds.map(tokenize, remove_columns=ds.column_names) + +# Configure the quantization algorithm to run. +# since the MoE gate layers are sensitive to quantization, we add them to the ignore +# list so they remain at full precision +recipe = GPTQModifier( + targets="Linear", scheme="W4A16", ignore=["lm_head", "re:.*mlp.gate$"] +) + +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + trust_remote_code_model=True, +) + +# Confirm generations of the quantized model look sane. +# Generation is broken for deepseek models when using the latest transformers package +if Version(__version__) < Version("4.48"): + print("========== SAMPLE GENERATION ==============") + dispatch_for_generation(model) + sample = tokenizer("Hello my name is", return_tensors="pt") + sample = {key: value.to("cuda") for key, value in sample.items()} + output = model.generate(**sample, max_new_tokens=100) + print(tokenizer.decode(output[0])) + print("==========================================") +else: + print( + "WARNING: cannot perform sample generation of " + "deepseek models with transformers >= 4.48" + ) + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/mixtral_example.py b/examples/quantizing_moe/mixtral_example.py new file mode 100644 index 000000000..49b08c722 --- /dev/null +++ b/examples/quantizing_moe/mixtral_example.py @@ -0,0 +1,86 @@ +import torch +from datasets import load_dataset +from transformers import AutoModelForCausalLM, AutoTokenizer + +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import QuantizationModifier +from llmcompressor.utils import dispatch_for_generation + +# select a Mixture of Experts model for quantization +MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1" + +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +# Select calibration dataset. +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" +NUM_CALIBRATION_SAMPLES = 512 +MAX_SEQUENCE_LENGTH = 2048 + + +# Load dataset and preprocess. +ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") +ds = ds.shuffle(seed=42) + + +def preprocess(example): + return { + "text": tokenizer.apply_chat_template( + example["messages"], + tokenize=False, + ) + } + + +ds = ds.map(preprocess) + + +# Tokenize inputs. +def tokenize(sample): + return tokenizer( + sample["text"], + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) + + +ds = ds.map(tokenize, remove_columns=ds.column_names) + +# Configure the quantization algorithm to run. 
+# since the MoE gate layers are sensitive to quantization, we add them to the ignore +# list so they remain at full precision +recipe = QuantizationModifier( + scheme="W4A16", + targets="Linear", + ignore=[ + "lm_head", + "re:.*block_sparse_moe.gate", # does not quantize well + ], +) + +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + trust_remote_code_model=True, +) + +print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) +sample = tokenizer("Hello my name is", return_tensors="pt") +sample = {key: value.to("cuda") for key, value in sample.items()} +output = model.generate(**sample, max_new_tokens=100) +print(tokenizer.decode(output[0])) +print("==========================================") + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/qwen_example.py b/examples/quantizing_moe/qwen_example.py new file mode 100644 index 000000000..2531e6528 --- /dev/null +++ b/examples/quantizing_moe/qwen_example.py @@ -0,0 +1,85 @@ +import torch +from datasets import load_dataset +from transformers import AutoModelForCausalLM, AutoTokenizer + +from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.transformers import oneshot +from llmcompressor.utils import dispatch_for_generation + +# select a Mixture of Experts model for quantization +MODEL_ID = "Qwen/Qwen1.5-MoE-A2.7B-Chat" + +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +# Select calibration dataset. +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" +NUM_CALIBRATION_SAMPLES = 512 +MAX_SEQUENCE_LENGTH = 2048 + + +# Load dataset and preprocess. +ds = load_dataset(DATASET_ID, split=DATASET_SPLIT) +ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) + + +def preprocess(example): + return { + "text": tokenizer.apply_chat_template( + example["messages"], + tokenize=False, + ) + } + + +ds = ds.map(preprocess) + + +# Tokenize inputs. +def tokenize(sample): + return tokenizer( + sample["text"], + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) + + +ds = ds.map(tokenize, remove_columns=ds.column_names) + +# define a llmcompressor recipe for W416 quantization with a group size of 128 +# since the MoE gate layers are sensitive to quantization, we add them to the ignore +# list so they remain at full precision +recipe = GPTQModifier( + targets="Linear", + scheme="W4A16", + ignore=["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"], +) + +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + save_compressed=True, + trust_remote_code_model=True, +) + +# Confirm generations of the quantized model look sane. +print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) +sample = tokenizer("Hello my name is", return_tensors="pt") +sample = {key: value.to("cuda") for key, value in sample.items()} +output = model.generate(**sample, max_new_tokens=100) +print(tokenizer.decode(output[0])) +print("==========================================") + +# Save to disk in compressed-tensors format. 
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) From 48123509037db3d2be970ad5b33b452fd73262c2 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Fri, 20 Jun 2025 14:03:47 -0400 Subject: [PATCH 19/22] clean up examples Signed-off-by: Kyle Sayers --- examples/quantizing_moe/README.md | 44 +++--- examples/quantizing_moe/deepseek_moe_w4a16.py | 125 ------------------ .../quantizing_moe/deepseek_moe_w8a8_fp8.py | 99 -------------- .../quantizing_moe/deepseek_moe_w8a8_int8.py | 101 -------------- .../quantizing_moe/deepseek_recipe_w4a16.yaml | 8 -- ...wen_moe_w4a16.py => deepseekv3_example.py} | 48 ++++--- .../quantizing_moe/mixtral_moe_w8a8_fp8.py | 58 -------- src/llmcompressor/modeling/__init__.py | 3 + src/llmcompressor/modeling/deepseek_v3.py | 48 +++++++ src/llmcompressor/modeling/prepare.py | 22 +++ src/llmcompressor/utils/module.py | 27 ++++ tests/examples/test_quantizing_moe.py | 15 ++- 12 files changed, 156 insertions(+), 442 deletions(-) delete mode 100644 examples/quantizing_moe/deepseek_moe_w4a16.py delete mode 100644 examples/quantizing_moe/deepseek_moe_w8a8_fp8.py delete mode 100644 examples/quantizing_moe/deepseek_moe_w8a8_int8.py delete mode 100644 examples/quantizing_moe/deepseek_recipe_w4a16.yaml rename examples/quantizing_moe/{qwen_moe_w4a16.py => deepseekv3_example.py} (52%) delete mode 100644 examples/quantizing_moe/mixtral_moe_w8a8_fp8.py create mode 100644 src/llmcompressor/modeling/__init__.py create mode 100644 src/llmcompressor/modeling/deepseek_v3.py create mode 100644 src/llmcompressor/modeling/prepare.py create mode 100644 src/llmcompressor/utils/module.py diff --git a/examples/quantizing_moe/README.md b/examples/quantizing_moe/README.md index 70243caf1..f2a162d91 100644 --- a/examples/quantizing_moe/README.md +++ b/examples/quantizing_moe/README.md @@ -1,6 +1,6 @@ -# Quantizing Mixtral-8x7B-Instruct-v0.1 Model with FP8 +# Quantizing Mixtral-8x7B-Instruct-v0.1 Model with W4A16 -This directory contains an example script for quantizing the `Mixtral-8x7B-Instruct-v0.1` model using the static per-tensor FP8 quantization scheme. +This directory contains an example script for quantizing the `Mixtral-8x7B-Instruct-v0.1` model using the static per-tensor W4A16 quantization scheme. ## Installation @@ -17,17 +17,17 @@ pip install -e . The provided example script demonstrates an end-to-end process for applying the quantization algorithm: ```bash -python3 mixtral_moe_w8a8_fp8.py +python3 mixtral_example.py ``` ## Creating a Quantized MoE Model -This example leverages `llm-compressor` and `compressed-tensors` to create an FP8-quantized `Mixtral-8x7B-Instruct-v0.1` model. The model is calibrated and trained using the `open_platypus` dataset. +This example leverages `llm-compressor` and `compressed-tensors` to create an W4A16-quantized `Mixtral-8x7B-Instruct-v0.1` model. The model is calibrated and trained using the `ultrachat_200k` dataset. You can follow the detailed steps below or simply run the example script with: ```bash -python mixtral_moe_w8a8_fp8.py +python mixtral_example.py ``` ### Step 1: Select a Model, Dataset, and Recipe @@ -36,24 +36,24 @@ In this step, you'll choose a baseline model for quantization, a dataset for cal - **Models**: Can be referenced from a local directory or retrieved from the Hugging Face Hub. - **Datasets**: Can also be from a local directory or the Hugging Face Hub. 
-- **Recipes**: These are YAML files or Python modifier objects that describe how a model should be optimized during or after training. In this example, we use a `QuantizationModifier` object with the scheme set to `FP8`.
+- **Recipes**: These are YAML files or Python modifier objects that describe how a model should be optimized during or after training. In this example, we use a `QuantizationModifier` object with the scheme set to `W4A16`.
 
 ```python
 from llmcompressor.modifiers.quantization import QuantizationModifier
 
-recipe = QuantizationModifier(scheme="FP8", targets="Linear", ignore=["lm_head", "re:.*block_sparse_moe.gate"])
+recipe = QuantizationModifier(scheme="W4A16", targets="Linear", ignore=["lm_head", "re:.*block_sparse_moe.gate"])
 ```
 
 NOTE: `.*block_sparse_moe.gate` layers do not quantize well, hence they are ignored!
 
 ### Step 2: Run Quantization Using Oneshot
 
-The `oneshot` method applies the selected recipe to your model and dataset without requiring any fine-tuning. The model will be sparsified and saved to `Mixtral-8x7B-Instruct-v0.1-FP8`.
+The `oneshot` method applies the selected recipe to your model and dataset without requiring any fine-tuning. The model will be sparsified and saved to `Mixtral-8x7B-Instruct-v0.1-W4A16-G128`.
 
 ```python
 from llmcompressor import oneshot
 
-output_dir = "Mixtral-8x7B-Instruct-v0.1-FP8"
+output_dir = "Mixtral-8x7B-Instruct-v0.1-W4A16-G128"
 
 oneshot(
     model=model,
@@ -74,7 +74,7 @@ NOTE: Only per-tensor quantization is supported in vLLM as of now (`vllm==0.6.1`
 
 The repository supports multiple quantization techniques configured via a recipe. Supported strategies include `tensor`, `group`, and `channel` quantization.
 
-In the above example, FP8 per-tensor quantization is used as specified by the `FP8` scheme. For other preset schemes, refer to the [quantization schemes](https://github.com/neuralmagic/compressed-tensors/blob/main/src/compressed_tensors/quantization/quant_scheme.py) in the `compressed-tensors` library.
+In the above example, quantization is specified by the `W4A16` scheme. For other preset schemes, refer to the [quantization schemes](https://github.com/neuralmagic/compressed-tensors/blob/main/src/compressed_tensors/quantization/quant_scheme.py) in the `compressed-tensors` library.
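For instance, switching to another preset only requires changing the `scheme` argument of the recipe. The snippet below is an illustrative sketch, not part of the patched README: it reuses the `GPTQModifier` and the Mixtral ignore list that appear elsewhere in these examples, with the `W8A8` preset standing in for whichever preset scheme you choose.

```python
from llmcompressor.modifiers.quantization import GPTQModifier

# Same targets/ignore lists as the W4A16 example above; only the preset changes.
# MoE gate layers stay at full precision because they do not quantize well.
recipe = GPTQModifier(
    targets="Linear",
    scheme="W8A8",  # any preset scheme from compressed-tensors can be used here
    ignore=["lm_head", "re:.*block_sparse_moe.gate"],
)
```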
A custom scheme can also be specified using `config_groups`: @@ -84,18 +84,18 @@ A custom scheme can also be specified using `config_groups`: from llmcompressor.modifiers.quantization.gptq import GPTQModifier config_groups = { - "group_0": { - "targets": ["Linear"], - "input_activations": None, - "output_activations": None, - "weights": { - "num_bits": 8, - "type": "int", - "symmetric": true, - "strategy": "group", - "group_size": 128, - } - } + "group_0": { + "targets": ["Linear"], + "input_activations": None, + "output_activations": None, + "weights": { + "num_bits": 8, + "type": "int", + "symmetric": true, + "strategy": "group", + "group_size": 128, + } + } } recipe = GPTQModifier(config_groups=config_groups) diff --git a/examples/quantizing_moe/deepseek_moe_w4a16.py b/examples/quantizing_moe/deepseek_moe_w4a16.py deleted file mode 100644 index 9880e9248..000000000 --- a/examples/quantizing_moe/deepseek_moe_w4a16.py +++ /dev/null @@ -1,125 +0,0 @@ -import torch -from datasets import load_dataset -from packaging.version import Version -from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ - -from llmcompressor import oneshot -from llmcompressor.utils import dispatch_for_generation - -# NOTE: transformers 4.49.0 has an attribute error with DeepSeek. -# Please consider either downgrading your transformers version to a -# previous version or upgrading to a version where this bug is fixed - -# select a Mixture of Experts model for quantization -MODEL_ID = "deepseek-ai/DeepSeek-V2.5" - -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -# Select calibration dataset. -DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_SPLIT = "train_sft" -NUM_CALIBRATION_SAMPLES = 512 -MAX_SEQUENCE_LENGTH = 2048 - - -# Load dataset and preprocess. -ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") -ds = ds.shuffle(seed=42) - - -def preprocess(example): - return { - "text": tokenizer.apply_chat_template( - example["messages"], - tokenize=False, - ) - } - - -ds = ds.map(preprocess) - - -# Tokenize inputs. -def tokenize(sample): - return tokenizer( - sample["text"], - padding=False, - max_length=MAX_SEQUENCE_LENGTH, - truncation=True, - add_special_tokens=False, - ) - - -ds = ds.map(tokenize, remove_columns=ds.column_names) - -# define a llmcompressor recipe for W416 quantization -# since the MoE gate layers are sensitive to quantization, we add them to the ignore -# list so they remain at full precision -recipe = "deepseek_recipe_w4a16.yaml" - -oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - save_compressed=True, - trust_remote_code_model=True, -) - -# Confirm generations of the quantized model look sane. -# Generation is broken for deepseek models when using the latest transformers package -if Version(__version__) < Version("4.48"): - print("========== SAMPLE GENERATION ==============") - dispatch_for_generation(model) - input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") - output = model.generate(input_ids, max_new_tokens=20) - print(tokenizer.decode(output[0])) - print("==========================================") -else: - print( - "WARNING: cannot perform sample generation of " - "deepseek models with transformers >= 4.48" - ) - -# Save to disk in compressed-tensors format. 
-SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) - - -# Run the model on vLLM -try: - from vllm import LLM, SamplingParams - - vllm_installed = True -except ImportError: - vllm_installed = False - -if vllm_installed: - print("vLLM installed, running using vLLM") - sampling_params = SamplingParams(temperature=0.80, top_p=0.95) - llm = LLM( - model=SAVE_DIR, - tensor_parallel_size=2, - trust_remote_code=True, - max_model_len=1042, - dtype=torch.half, - ) - prompts = [ - "The capital of France is", - "The president of the US is", - "My name is", - ] - - outputs = llm.generate(prompts, sampling_params) - print("================= vLLM GENERATION ======================") - for output in outputs: - assert output - prompt = output.prompt - generated_text = output.outputs[0].text - print("PROMPT", prompt) - print("GENERATED TEXT", generated_text) diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py b/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py deleted file mode 100644 index 0bc9c24df..000000000 --- a/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py +++ /dev/null @@ -1,99 +0,0 @@ -from datasets import load_dataset -from packaging.version import Version -from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ - -from llmcompressor import oneshot -from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.utils import dispatch_for_generation - -# NOTE: transformers 4.49.0 has an attribute error with DeepSeek. -# Please consider either downgrading your transformers version to a -# previous version or upgrading to a version where this bug is fixed - -# select a Mixture of Experts model for quantization -MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" - -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, torch_dtype="auto", trust_remote_code=True -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -# Select calibration dataset. -# its recommended to use more calibration samples for MoE models so each expert is hit -DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_SPLIT = "train_sft" -NUM_CALIBRATION_SAMPLES = 2048 -MAX_SEQUENCE_LENGTH = 2048 - - -# Load dataset and preprocess. -ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") -ds = ds.shuffle(seed=42) - - -def preprocess(example): - return { - "text": tokenizer.apply_chat_template( - example["messages"], - tokenize=False, - ) - } - - -ds = ds.map(preprocess) - - -# Tokenize inputs. -def tokenize(sample): - return tokenizer( - sample["text"], - padding=False, - max_length=MAX_SEQUENCE_LENGTH, - truncation=True, - add_special_tokens=False, - ) - - -ds = ds.map(tokenize, remove_columns=ds.column_names) - -# define a llmcompressor recipe for FP8 W8A8 quantization -# since the MoE gate layers are sensitive to quantization, we add them to the ignore -# list so they remain at full precision -recipe = [ - QuantizationModifier( - targets="Linear", - scheme="FP8", - ignore=["lm_head", "re:.*mlp.gate$"], - ), -] - -oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - trust_remote_code_model=True, -) - -# Confirm generations of the quantized model look sane. 
-# Generation is broken for deepseek models when using the latest transformers package -if Version(__version__) < Version("4.48"): - print("========== SAMPLE GENERATION ==============") - dispatch_for_generation(model) - SAMPLE_INPUT = ["I love quantization because"] - tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device) - output = model.generate(**inputs, max_length=50) - text_output = tokenizer.batch_decode(output) - print(text_output) -else: - print( - "WARNING: cannot perform sample generation of " - "deepseek models with transformers >= 4.48" - ) - -# Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_int8.py b/examples/quantizing_moe/deepseek_moe_w8a8_int8.py deleted file mode 100644 index 3ec506c34..000000000 --- a/examples/quantizing_moe/deepseek_moe_w8a8_int8.py +++ /dev/null @@ -1,101 +0,0 @@ -import torch -from datasets import load_dataset -from packaging.version import Version -from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ - -from llmcompressor import oneshot -from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.utils import dispatch_for_generation - -# NOTE: transformers 4.49.0 has an attribute error with DeepSeek. -# Please consider either downgrading your transformers version to a -# previous version or upgrading to a version where this bug is fixed - -# select a Mixture of Experts model for quantization -MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" - -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -# Select calibration dataset. -# its recommended to use more calibration samples for MoE models so each expert is hit -DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_SPLIT = "train_sft" -NUM_CALIBRATION_SAMPLES = 2048 -MAX_SEQUENCE_LENGTH = 2048 - - -# Load dataset and preprocess. -ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") -ds = ds.shuffle(seed=42) - - -def preprocess(example): - return { - "text": tokenizer.apply_chat_template( - example["messages"], - tokenize=False, - ) - } - - -ds = ds.map(preprocess) - - -# Tokenize inputs. -def tokenize(sample): - return tokenizer( - sample["text"], - padding=False, - max_length=MAX_SEQUENCE_LENGTH, - truncation=True, - add_special_tokens=False, - ) - - -ds = ds.map(tokenize, remove_columns=ds.column_names) - -# define a llmcompressor recipe for INT8 W8A8 quantization -# since the MoE gate layers are sensitive to quantization, we add them to the ignore -# list so they remain at full precision -recipe = [ - GPTQModifier( - targets="Linear", - scheme="W8A8", - ignore=["lm_head", "re:.*mlp.gate$"], - ), -] - -oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - trust_remote_code_model=True, -) - -# Confirm generations of the quantized model look sane. 
-# Generation is broken for deepseek models when using the latest transformers package -if Version(__version__) < Version("4.48"): - print("========== SAMPLE GENERATION ==============") - dispatch_for_generation(model) - SAMPLE_INPUT = ["I love quantization because"] - tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device) - output = model.generate(**inputs, max_length=50) - text_output = tokenizer.batch_decode(output) - print(text_output) - print("==========================================") -else: - print( - "WARNING: cannot perform sample generation of " - "deepseek models with transformers >= 4.48" - ) - -# Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W8A8" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/deepseek_recipe_w4a16.yaml b/examples/quantizing_moe/deepseek_recipe_w4a16.yaml deleted file mode 100644 index 23f276e2f..000000000 --- a/examples/quantizing_moe/deepseek_recipe_w4a16.yaml +++ /dev/null @@ -1,8 +0,0 @@ -quant_stage: - quant_modifiers: - GPTQModifier: - ignore: [lm_head, "re:.*mlp.gate$"] - config_groups: - group_0: - weights: {num_bits: 4, type: int, symmetric: true, strategy: channel, dynamic: false} - targets: [Linear] diff --git a/examples/quantizing_moe/qwen_moe_w4a16.py b/examples/quantizing_moe/deepseekv3_example.py similarity index 52% rename from examples/quantizing_moe/qwen_moe_w4a16.py rename to examples/quantizing_moe/deepseekv3_example.py index 40a78a9b7..1b4c334ff 100644 --- a/examples/quantizing_moe/qwen_moe_w4a16.py +++ b/examples/quantizing_moe/deepseekv3_example.py @@ -1,29 +1,31 @@ -import torch from datasets import load_dataset from transformers import AutoModelForCausalLM, AutoTokenizer +from llmcompressor.modeling import prepare_for_quantization from llmcompressor.modifiers.quantization import GPTQModifier from llmcompressor.transformers import oneshot from llmcompressor.utils import dispatch_for_generation -# select a Mixture of Experts model for quantization -MODEL_ID = "Qwen/Qwen1.5-MoE-A2.7B-Chat" - -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +# Select model and load it. +# For DeepSeekv3, we require a full precision model in order to properly calibrate +# `DeepSeek-V3-BF16` is a DeepSeek-V3 FP8 model which has been converted to BF16 +model_id = "RedHatAI/DeepSeek-V3-BF16" +model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto") +tokenizer = AutoTokenizer.from_pretrained(model_id) +model = prepare_for_quantization(model) # Select calibration dataset. DATASET_ID = "HuggingFaceH4/ultrachat_200k" DATASET_SPLIT = "train_sft" + +# Select number of samples. 512 samples is a good place to start. +# Increasing the number of samples can improve accuracy. NUM_CALIBRATION_SAMPLES = 512 MAX_SEQUENCE_LENGTH = 2048 - # Load dataset and preprocess. 
-ds = load_dataset(DATASET_ID, split=DATASET_SPLIT) -ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) +ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") +ds = ds.shuffle(seed=42) def preprocess(example): @@ -51,34 +53,36 @@ def tokenize(sample): ds = ds.map(tokenize, remove_columns=ds.column_names) -# define a llmcompressor recipe for W416 quantization with a group size of 128 +# Configure the quantization algorithm to run. # since the MoE gate layers are sensitive to quantization, we add them to the ignore # list so they remain at full precision recipe = GPTQModifier( - targets="Linear", - scheme="W4A16", - ignore=["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"], + targets="Linear", scheme="W4A16", ignore=["lm_head", "re:.*mlp.gate$"] ) +# Apply algorithms. +# due to the large size of DeepSeekV3, we specify sequential targets such that +# only one MLP is loaded into GPU memory at a time oneshot( model=model, dataset=ds, recipe=recipe, max_seq_length=MAX_SEQUENCE_LENGTH, num_calibration_samples=NUM_CALIBRATION_SAMPLES, - save_compressed=True, - trust_remote_code_model=True, + sequential_targets=["DeepseekV3Attention", "DeepseekV3MLP"], ) # Confirm generations of the quantized model look sane. +print("\n\n") print("========== SAMPLE GENERATION ==============") dispatch_for_generation(model) -input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") -output = model.generate(input_ids, max_new_tokens=20) +sample = tokenizer("Hello my name is", return_tensors="pt") +sample = {key: value.to("cuda") for key, value in sample.items()} +output = model.generate(**sample, max_new_tokens=100) print(tokenizer.decode(output[0])) -print("==========================================") +print("==========================================\n\n") -# Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-quantized.w4a16" +# Save to disk compressed. +SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py b/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py deleted file mode 100644 index a17bf873d..000000000 --- a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py +++ /dev/null @@ -1,58 +0,0 @@ -from typing import List - -from packaging.version import Version -from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ - -from llmcompressor import oneshot -from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.utils import dispatch_for_generation - -MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1" - -model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - - -# Dataset config parameters -DATASET_ID = "open_platypus" -DATASET_SPLIT = "train" -MAX_SEQ_LENGTH = 2048 -NUM_CALIBRATION_SAMPLES = 512 - -# Recipe -layers_to_ignore: List[str] = [ - "lm_head", - "re:.*block_sparse_moe.gate", # does not quantize well -] -recipe = QuantizationModifier(scheme="FP8", targets="Linear", ignore=layers_to_ignore) - - -oneshot( - model=model, - tokenizer=tokenizer, - dataset=DATASET_ID, - splits={"calibration": f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]"}, - recipe=recipe, - max_seq_length=MAX_SEQ_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, -) - -# Confirm generations of the quantized model look sane. 
-# Generation is broken for deepseek models when using the latest transformers package -if Version(__version__) < Version("4.48"): - print("========== SAMPLE GENERATION ==============") - dispatch_for_generation(model) - input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") - output = model.generate(input_ids, max_new_tokens=20) - print(tokenizer.decode(output[0])) - print("==========================================") -else: - print( - "WARNING: cannot perform sample generation of " - "deepseek models with transformers >= 4.48" - ) - -# Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) diff --git a/src/llmcompressor/modeling/__init__.py b/src/llmcompressor/modeling/__init__.py new file mode 100644 index 000000000..e2c22ed1f --- /dev/null +++ b/src/llmcompressor/modeling/__init__.py @@ -0,0 +1,3 @@ +# flake8: noqa + +from .prepare import * diff --git a/src/llmcompressor/modeling/deepseek_v3.py b/src/llmcompressor/modeling/deepseek_v3.py new file mode 100644 index 000000000..4b885ff64 --- /dev/null +++ b/src/llmcompressor/modeling/deepseek_v3.py @@ -0,0 +1,48 @@ +import torch +from transformers.models.deepseek_v3.modeling_deepseek_v3 import DeepseekV3MoE + + +class DeepseekV3MoECalibrate(torch.nn.Module): + def __init__(self, config, experts, gate, shared_experts): + super().__init__() + self.config = config + self.experts = experts + self.gate = gate + self.shared_experts = shared_experts + + def forward(self, hidden_states): + residuals = hidden_states + orig_shape = hidden_states.shape + topk_indices, topk_weights = self.gate(hidden_states) + hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) + + # Begin MoE + final_hidden_states = torch.zeros_like(hidden_states, dtype=topk_weights.dtype) + expert_mask = torch.nn.functional.one_hot( + topk_indices, num_classes=len(self.experts) + ) + expert_mask = expert_mask.permute(2, 0, 1) + + for expert_idx in range(len(self.experts)): + expert = self.experts[expert_idx] + mask = expert_mask[expert_idx] + token_indices, weight_indices = torch.where(mask) + + expert_weights = topk_weights[token_indices, weight_indices] + expert_input = hidden_states[token_indices] + expert_output = expert(expert_input) + weighted_output = expert_output * expert_weights.unsqueeze(-1) + + if token_indices.numel() > 0: + final_hidden_states.index_add_(0, token_indices, weighted_output) + # End MoE + + hidden_states = final_hidden_states.type(hidden_states.dtype).view(*orig_shape) + hidden_states = hidden_states + self.shared_experts(residuals) + return hidden_states + + +def replace(module: DeepseekV3MoE) -> DeepseekV3MoECalibrate: + return DeepseekV3MoECalibrate( + module.config, module.experts, module.gate, module.shared_experts + ) diff --git a/src/llmcompressor/modeling/prepare.py b/src/llmcompressor/modeling/prepare.py new file mode 100644 index 000000000..a8dedf8ee --- /dev/null +++ b/src/llmcompressor/modeling/prepare.py @@ -0,0 +1,22 @@ +import torch +from transformers import PreTrainedModel +from transformers.models.deepseek_v3.modeling_deepseek_v3 import DeepseekV3MoE + +from llmcompressor.modeling.deepseek_v3 import replace as replace_DeepseekV3MoE +from llmcompressor.utils.module import module_bfs + +__all__ = ["prepare_for_quantization"] + +replacements = { + DeepseekV3MoE: replace_DeepseekV3MoE, +} + + +def prepare_for_quantization(model: PreTrainedModel) -> PreTrainedModel: + 
def replace(module: torch.nn.Module) -> torch.nn.Module: + if module.__class__ in replacements: + return replacements[module.__class__](module) + else: + return module + + return module_bfs(model, replace, progress=True) diff --git a/src/llmcompressor/utils/module.py b/src/llmcompressor/utils/module.py new file mode 100644 index 000000000..a02aa8b4a --- /dev/null +++ b/src/llmcompressor/utils/module.py @@ -0,0 +1,27 @@ +from typing import Callable, Union + +import torch +import tqdm + +__all__ = ["module_bfs"] + + +def module_bfs( + module: torch.nn.Module, + func: Callable[[torch.nn.Module], torch.nn.Module], + pre: bool = True, + progress: Union[bool, tqdm.tqdm] = False, +) -> torch.nn.Module: + if progress is True: + total = len(list(module.modules())) + progress = tqdm.tqdm(total=total) + if pre: + module = func(module) + for name, child in list(module.named_children()): + module.add_module(name, module_bfs(child, func, pre, progress)) + if not pre: + module = func(module) + if isinstance(progress, tqdm.tqdm): + progress.update(1) + + return module diff --git a/tests/examples/test_quantizing_moe.py b/tests/examples/test_quantizing_moe.py index 49686d25c..1f5a53a56 100644 --- a/tests/examples/test_quantizing_moe.py +++ b/tests/examples/test_quantizing_moe.py @@ -44,14 +44,15 @@ def test_doc_example_command(self, example_dir: str, tmp_path: Path): "script_filename", [ pytest.param( - "deepseek_moe_w4a16.py", - marks=[ - pytest.mark.multi_gpu, - pytest.mark.skip(reason="exceptionally long run time"), - ], + "deepseekv2_5_example.py", + marks=pytest.mark.skip(reason="exceptionally long run time"), ), - pytest.param("deepseek_moe_w8a8_fp8.py"), - pytest.param("deepseek_moe_w8a8_int8.py", marks=pytest.mark.multi_gpu), + pytest.param( + "deepseekv3_example.py", + marks=pytest.mark.skip(reason="exceptionally long run time"), + ), + pytest.param("mixtral_example.py"), + pytest.param("qwen_example.py"), ], ) def test_deepseek_example_script( From 626000d1b80cc90bce483334c71f2b3f952c9f48 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 26 Jun 2025 14:26:10 -0400 Subject: [PATCH 20/22] merge with main src Signed-off-by: Kyle Sayers --- src/llmcompressor/args/dataset_arguments.py | 7 - src/llmcompressor/args/model_arguments.py | 7 - src/llmcompressor/entrypoints/oneshot.py | 2 - src/llmcompressor/metrics/logger.py | 169 ++++++++---------- src/llmcompressor/modeling/deepseek_v3.py | 4 + src/llmcompressor/modeling/prepare.py | 22 ++- .../modifiers/obcq/sgpt_sparsify.py | 4 +- .../modifiers/quantization/calibration.py | 15 ++ src/llmcompressor/observers/min_max.py | 1 + src/llmcompressor/observers/mse.py | 9 +- src/llmcompressor/pipelines/basic/pipeline.py | 6 +- .../pipelines/layer_sequential/pipeline.py | 7 +- .../pipelines/sequential/pipeline.py | 7 +- .../compressed_tensors_utils.py | 19 -- src/llmcompressor/utils/dev.py | 28 +-- 15 files changed, 117 insertions(+), 190 deletions(-) diff --git a/src/llmcompressor/args/dataset_arguments.py b/src/llmcompressor/args/dataset_arguments.py index 677d09daa..33193cde2 100644 --- a/src/llmcompressor/args/dataset_arguments.py +++ b/src/llmcompressor/args/dataset_arguments.py @@ -197,10 +197,3 @@ class DatasetArguments(CustomDatasetArguments): "definition" }, ) - model_input_device: Optional[str] = field( - default=None, - metadata={ - "help": "Device to put model inputs on for calibration. 
" - "If none is specified, the model input device is inferred from the model" - }, - ) diff --git a/src/llmcompressor/args/model_arguments.py b/src/llmcompressor/args/model_arguments.py index ea3c3936a..e68bd16aa 100644 --- a/src/llmcompressor/args/model_arguments.py +++ b/src/llmcompressor/args/model_arguments.py @@ -80,13 +80,6 @@ class ModelArguments: default=True, metadata={"help": "Whether to compress sparse models during save"}, ) - oneshot_device: Optional[str] = field( - default="cuda", - metadata={ - "help": "This argument is deprecated and nonfunctional " - "and will be removed in future release" - }, - ) model_revision: str = field( default="main", metadata={ diff --git a/src/llmcompressor/entrypoints/oneshot.py b/src/llmcompressor/entrypoints/oneshot.py index 945c71943..707aafedf 100644 --- a/src/llmcompressor/entrypoints/oneshot.py +++ b/src/llmcompressor/entrypoints/oneshot.py @@ -208,7 +208,6 @@ def oneshot( tie_word_embeddings: bool = False, trust_remote_code_model: bool = False, save_compressed: bool = True, - oneshot_device: str = "cuda:0", model_revision: str = "main", # Recipe arguments recipe: Optional[Union[str, List[str]]] = None, @@ -259,7 +258,6 @@ def oneshot( :param trust_remote_code_model: Whether to allow for custom models to execute their own modeling files. :param save_compressed: Whether to compress sparse models during save. - :param oneshot_device: Device to run oneshot calibration on. :param model_revision: The specific model version to use (can be branch name, tag, or commit id). diff --git a/src/llmcompressor/metrics/logger.py b/src/llmcompressor/metrics/logger.py index b4c2f9505..8c895143b 100644 --- a/src/llmcompressor/metrics/logger.py +++ b/src/llmcompressor/metrics/logger.py @@ -2,18 +2,18 @@ Contains code for loggers that help visualize the information from each modifier """ -import logging import os import time import warnings from abc import ABC from contextlib import contextmanager from datetime import datetime -from logging import CRITICAL, DEBUG, ERROR, INFO, WARN, Logger from pathlib import Path from types import ModuleType from typing import Any, Callable, Dict, List, Optional, Union +from loguru import logger + from llmcompressor.metrics.utils import ( FrequencyManager, FrequencyType, @@ -52,16 +52,8 @@ "WANDBLogger", "SparsificationGroupLogger", "LoggerManager", - "LOGGING_LEVELS", ] ALL_TOKEN = "__ALL__" -LOGGING_LEVELS = { - "debug": DEBUG, - "info": INFO, - "warn": WARN, - "error": ERROR, - "critical": CRITICAL, -} DEFAULT_TAG = "defaul_tag" @@ -231,11 +223,12 @@ def lambda_func( def log_hyperparams( self, params: Dict, - level: Optional[int] = None, + level: Optional[Union[int, str]] = None, ) -> bool: """ :param params: Each key-value pair in the dictionary is the name of the hyper parameter and it's corresponding value. + :param level: minimum severity level for the log message :return: True if logged, False otherwise. 
""" if not self.enabled: @@ -256,7 +249,7 @@ def log_scalar( value: float, step: Optional[int] = None, wall_time: Optional[float] = None, - level: Optional[int] = None, + level: Optional[Union[int, str]] = None, ) -> bool: """ :param tag: identifying tag to log the value with @@ -264,6 +257,7 @@ def log_scalar( :param step: global step for when the value was taken :param wall_time: global wall time for when the value was taken, defaults to time.time() + :param level: minimum severity level for the log message :param kwargs: additional logging arguments to support Python and custom loggers :return: True if logged, False otherwise. """ @@ -285,7 +279,7 @@ def log_scalars( values: Dict[str, float], step: Optional[int] = None, wall_time: Optional[float] = None, - level: Optional[int] = None, + level: Optional[Union[int, str]] = None, ) -> bool: """ :param tag: identifying tag to log the values with @@ -293,6 +287,7 @@ def log_scalars( :param step: global step for when the values were taken :param wall_time: global wall time for when the values were taken, defaults to time.time() + :param level: minimum severity level for the log message :param kwargs: additional logging arguments to support Python and custom loggers :return: True if logged, False otherwise. """ @@ -313,22 +308,20 @@ class PythonLogger(LambdaLogger): """ Modifier metrics that handles printing values into a python metrics instance. - :param logger: a metrics instance to log to, if None then will create it's own - :param log_level: default level to log any incoming data at on the logging.Logger - instance when an explicit log level isn't provided :param name: name given to the metrics, used for identification; defaults to python :param enabled: True to log, False otherwise """ + # Class-level variable to track if file sink is created + _global_file_sink_id = None + def __init__( self, - logger: Logger = None, - log_level: int = None, name: str = "python", enabled: bool = True, ): - self._logger = logger or self._create_default_logger(log_level=log_level) + self._create_default_logger() super().__init__( lambda_func=self._log_lambda, @@ -336,17 +329,7 @@ def __init__( enabled=enabled, ) - def __getattr__(self, item): - return getattr(self._logger, item) - - @property - def logger(self) -> Logger: - """ - :return: a metrics instance to log to, if None then will create it's own - """ - return self._logger - - def _create_default_logger(self, log_level: Optional[int] = None) -> logging.Logger: + def _create_default_logger(self) -> None: """ Create a default modifier metrics, with a file handler logging at the debug level @@ -355,24 +338,9 @@ def _create_default_logger(self, log_level: Optional[int] = None) -> logging.Log :param log_level: logging level for the console metrics :return: metrics """ - logger = logging.getLogger(__name__) - - # Console handler, for logging high level modifier logs - # must be created before the file handler - # as file handler is also a stream handler - if not any( - isinstance(handler, logging.StreamHandler) for handler in logger.handlers - ): - stream_handler = logging.StreamHandler() - stream_handler.setLevel( - log_level or logging.getLogger("llmcompressor").level - ) - logger.addHandler(stream_handler) # File handler setup, for logging modifier debug statements - if not any( - isinstance(handler, logging.FileHandler) for handler in logger.handlers - ): + if PythonLogger._global_file_sink_id is None: base_log_path = ( os.environ.get("NM_TEST_LOG_DIR") if os.environ.get("NM_TEST_MODE") @@ -382,19 
+350,11 @@ def _create_default_logger(self, log_level: Optional[int] = None) -> logging.Log dt_string = now.strftime("%d-%m-%Y_%H.%M.%S") log_path = os.path.join(base_log_path, f"{dt_string}.log") os.makedirs(base_log_path, exist_ok=True) - file_handler = logging.FileHandler( - log_path, - delay=True, + PythonLogger._global_file_sink_id = logger.add( + log_path, level="DEBUG", delay=True ) - file_handler.setLevel(LOGGING_LEVELS["debug"]) - logger.addHandler(file_handler) logger.info(f"Logging all LLM Compressor modifier-level logs to {log_path}") - logger.setLevel(LOGGING_LEVELS["debug"]) - logger.propagate = False - - return logger - def _log_lambda( self, tag: Optional[str], @@ -402,7 +362,7 @@ def _log_lambda( values: Optional[Dict[str, float]], step: Optional[int], wall_time: Optional[float], - level: Optional[int] = None, + level: Optional[Union[int, str]] = None, ) -> bool: """ :param tag: identifying tag to log the values with @@ -411,13 +371,22 @@ def _log_lambda( :param step: global step for when the values were taken :param wall_time: global wall time for when the values were taken, defaults to time.time() - :param level: level to log at. Corresponds to default logging package levels + :param level: minimum severity level for the log message :return: True if logged, False otherwise. """ if not level: - level = LOGGING_LEVELS["debug"] + level = "DEBUG" + + def is_higher_than_debug(lev: Optional[Union[int, str]] = None) -> bool: + """Check if the given level is higher than DEBUG level.""" + debug_level_no = logger.level("DEBUG").no + if isinstance(lev, int): + return level > debug_level_no + elif isinstance(lev, str): + return logger.level(lev).no > debug_level_no + return False - if level > LOGGING_LEVELS["debug"]: + if is_higher_than_debug(level): if step is not None: format = "%s %s step %s: %s" log_args = [ @@ -433,7 +402,7 @@ def _log_lambda( format = "%s %s [%s - %s]: %s" log_args = [self.name, tag, step, wall_time, values or value] - self._logger.log(level, format, *log_args) + logger.log(level, format, *log_args) return True @@ -443,7 +412,7 @@ def log_string( string: Optional[str], step: Optional[int], wall_time: Optional[float] = None, - level: Optional[int] = None, + level: Optional[Union[int, str]] = None, ) -> bool: """ :param tag: identifying tag to log the values with @@ -451,7 +420,7 @@ def log_string( :param step: global step for when the values were taken :param wall_time: global wall time for when the values were taken, defaults to time.time() - :param level: level to log at. Corresponds to default logging package levels + :param level: minimum severity level for the log message :return: True if logged, False otherwise. """ if not wall_time: @@ -540,7 +509,7 @@ def _log_lambda( values: Optional[Dict[str, float]], step: Optional[int], wall_time: Optional[float], - level: Optional[int] = None, + level: Optional[Union[int, str]] = None, ) -> bool: if value is not None: self._writer.add_scalar(tag, value, step, wall_time) @@ -614,7 +583,7 @@ def _log_lambda( values: Optional[Dict[str, float]], step: Optional[int], wall_time: Optional[float], - level: Optional[int] = None, + level: Optional[Union[int, str]] = None, ) -> bool: params = {} @@ -656,11 +625,10 @@ class SparsificationGroupLogger(BaseLogger): :param lambda_func: an optional lambda function to call back into with any logs. The expected call sequence is (tag, value, values, step, wall_time) -> bool The return type is True if logged and False otherwise. 
- :param python: an optional argument for logging to a python metrics. - May be a logging.Logger instance to log to, True to create a metrics instance, - or non truthy to not log anything (False, None) + :param python: an bool argument for logging to a python metrics. + True to create a metrics instance, or False to not log anything :param python_log_level: if python, - the level to log any incoming data at on the logging.Logger instance + the level to log any incoming data at on the loguru.logger instance :param tensorboard: an optional argument for logging to a tensorboard writer. May be a SummaryWriter instance to log to, a string representing the directory to create a new SummaryWriter to log to, True to create a new SummaryWriter, @@ -688,8 +656,8 @@ def __init__( bool, ] ] = None, - python: Optional[Union[bool, Logger]] = None, - python_log_level: int = logging.INFO, + python: bool = False, + python_log_level: Optional[Union[int, str]] = "INFO", tensorboard: Optional[Union[bool, str, SummaryWriter]] = None, wandb_: Optional[Union[bool, Dict]] = None, name: str = "sparsification", @@ -706,8 +674,6 @@ def __init__( if python: self._loggers.append( PythonLogger( - logger=python if isinstance(python, Logger) else None, - log_level=python_log_level, name=name, enabled=enabled, ) @@ -741,8 +707,8 @@ def enabled(self, value: bool): """ self._enabled = value - for logger in self._loggers: - logger.enabled = value + for log in self._loggers: + log.enabled = value @property def loggers(self) -> List[BaseLogger]: @@ -751,13 +717,13 @@ def loggers(self) -> List[BaseLogger]: """ return self._loggers - def log_hyperparams(self, params: Dict, level: Optional[int] = None): + def log_hyperparams(self, params: Dict, level: Optional[Union[int, str]] = None): """ :param params: Each key-value pair in the dictionary is the name of the hyper parameter and it's corresponding value. 
""" - for logger in self._loggers: - logger.log_hyperparams(params, level) + for log in self._loggers: + log.log_hyperparams(params, level) def log_scalar( self, @@ -765,7 +731,7 @@ def log_scalar( value: float, step: Optional[int] = None, wall_time: Optional[float] = None, - level: Optional[int] = None, + level: Optional[Union[int, str]] = None, ): """ :param tag: identifying tag to log the value with @@ -773,9 +739,10 @@ def log_scalar( :param step: global step for when the value was taken :param wall_time: global wall time for when the value was taken, defaults to time.time() + :param level: minimum severity level for the log message """ - for logger in self._loggers: - logger.log_scalar(tag, value, step, wall_time, level) + for log in self._loggers: + log.log_scalar(tag, value, step, wall_time, level) def log_scalars( self, @@ -783,7 +750,7 @@ def log_scalars( values: Dict[str, float], step: Optional[int] = None, wall_time: Optional[float] = None, - level: Optional[int] = None, + level: Optional[Union[int, str]] = None, ): """ :param tag: identifying tag to log the values with @@ -791,9 +758,10 @@ def log_scalars( :param step: global step for when the values were taken :param wall_time: global wall time for when the values were taken, defaults to time.time() + :param level: minimum severity level for the log message """ - for logger in self._loggers: - logger.log_scalars(tag, values, step, wall_time, level) + for log in self._loggers: + log.log_scalars(tag, values, step, wall_time, level) class LoggerManager(ABC): @@ -956,7 +924,7 @@ def log_scalar( step: Optional[int] = None, wall_time: Optional[float] = None, log_types: Union[str, List[str]] = ALL_TOKEN, - level: Optional[int] = None, + level: Optional[Union[int, str]] = None, ): """ (Note: this method is deprecated and will be removed in a future version, @@ -966,6 +934,7 @@ def log_scalar( :param value: value to save :param step: global step for when the value was taken :param wall_time: global wall time for when the value was taken + :param level: minimum severity level for the log message :param kwargs: additional logging arguments to support Python and custom loggers :return: True if logged, False otherwise. """ @@ -986,7 +955,7 @@ def log_scalars( step: Optional[int] = None, wall_time: Optional[float] = None, log_types: Union[str, List[str]] = ALL_TOKEN, - level: Optional[int] = None, + level: Optional[Union[int, str]] = None, ): """ (Note: this method is deprecated and will be removed in a future version, @@ -996,6 +965,7 @@ def log_scalars( :param values: values to save :param step: global step for when the values were taken :param wall_time: global wall time for when the values were taken + :param level: minimum severity level for the log message :param kwargs: additional logging arguments to support Python and custom loggers :return: True if logged, False otherwise. 
""" @@ -1013,7 +983,7 @@ def log_hyperparams( self, params: Dict, log_types: Union[str, List[str]] = ALL_TOKEN, - level: Optional[int] = None, + level: Optional[Union[int, str]] = None, ): """ (Note: this method is deprecated and will be removed in a future version, @@ -1036,7 +1006,7 @@ def log_string( step: Optional[int] = None, wall_time: Optional[float] = None, log_types: Union[str, List[str]] = ALL_TOKEN, - level: Optional[int] = None, + level: Optional[Union[int, str]] = None, ): """ (Note: this method is deprecated and will be removed in a future version, @@ -1047,6 +1017,7 @@ def log_string( :param step: global step for when the values were taken :param wall_time: global wall time for when the values were taken :param kwargs: additional logging arguments to support Python and custom loggers + :param level: minimum severity level for the log message :return: True if logged, False otherwise. """ self.system.log_string( @@ -1119,13 +1090,14 @@ def log_string( step: Optional[int] = None, wall_time: Optional[float] = None, log_types: Union[str, List[str]] = ALL_TOKEN, - level: Optional[int] = None, + level: Optional[Union[int, str]] = None, ): """ :param tag: identifying tag to log the values with :param values: values to save :param step: global step for when the values were taken :param wall_time: global wall time for when the values were taken + :param level: minimum severity level for the log message :param kwargs: additional logging arguments to support Python and custom loggers :return: True if logged, False otherwise. """ @@ -1151,7 +1123,7 @@ def debug(self, tag, string, *args, **kwargs): :param kwargs: additional arguments to pass to the metrics, see `log_string` for more details """ - kwargs["level"] = logging.DEBUG + kwargs["level"] = "DEBUG" self.log_string(tag=tag, string=string, *args, **kwargs) def info(self, tag, string, *args, **kwargs): @@ -1166,7 +1138,7 @@ def info(self, tag, string, *args, **kwargs): :param kwargs: additional arguments to pass to the metrics, see `log_string` for more details """ - kwargs["level"] = logging.INFO + kwargs["level"] = "INFO" self.log_string(tag=tag, string=string, *args, **kwargs) def warning(self, tag, string, *args, **kwargs): @@ -1181,7 +1153,7 @@ def warning(self, tag, string, *args, **kwargs): :param kwargs: additional arguments to pass to the metrics, see `log_string` for more details """ - kwargs["level"] = logging.WARNING + kwargs["level"] = "WARNING" self.log_string(tag=tag, string=string, *args, **kwargs) def warn(self, tag, string, *args, **kwargs): @@ -1204,7 +1176,7 @@ def error(self, tag, string, *args, **kwargs): :param kwargs: additional arguments to pass to the metrics, see `log_string` for more details """ - kwargs["level"] = logging.ERROR + kwargs["level"] = "ERROR" self.log_string(tag=tag, string=string, *args, **kwargs) def critical(self, tag, string, *args, **kwargs): @@ -1219,7 +1191,7 @@ def critical(self, tag, string, *args, **kwargs): :param kwargs: additional arguments to pass to the metrics, see `log_string` for more details """ - kwargs["level"] = logging.CRITICAL + kwargs["level"] = "CRITICAL" self.log_string(tag=tag, string=string, *args, **kwargs) @@ -1232,11 +1204,12 @@ def log_hyperparams( self, params: Dict, log_types: Union[str, List[str]] = ALL_TOKEN, - level: Optional[int] = None, + level: Optional[Union[int, str]] = None, ): """ :param params: Each key-value pair in the dictionary is the name of the hyper parameter and it's corresponding value. 
+ :param level: minimum severity level for the log message """ for log in self.loggers: if log.enabled and (log_types == ALL_TOKEN or log.name in log_types): @@ -1249,13 +1222,14 @@ def log_scalar( step: Optional[int] = None, wall_time: Optional[float] = None, log_types: Union[str, List[str]] = ALL_TOKEN, - level: Optional[int] = None, + level: Optional[Union[int, str]] = None, ): """ :param tag: identifying tag to log the value with :param value: value to save :param step: global step for when the value was taken :param wall_time: global wall time for when the value was taken + :param level: minimum severity level for the log message :param kwargs: additional logging arguments to support Python and custom loggers :return: True if logged, False otherwise. """ @@ -1276,13 +1250,14 @@ def log_scalars( step: Optional[int] = None, wall_time: Optional[float] = None, log_types: Union[str, List[str]] = ALL_TOKEN, - level: Optional[int] = None, + level: Optional[Union[int, str]] = None, ): """ :param tag: identifying tag to log the values with :param values: values to save :param step: global step for when the values were taken :param wall_time: global wall time for when the values were taken + :param level: minimum severity level for the log message :param kwargs: additional logging arguments to support Python and custom loggers :return: True if logged, False otherwise. """ diff --git a/src/llmcompressor/modeling/deepseek_v3.py b/src/llmcompressor/modeling/deepseek_v3.py index 4b885ff64..c5de440ce 100644 --- a/src/llmcompressor/modeling/deepseek_v3.py +++ b/src/llmcompressor/modeling/deepseek_v3.py @@ -3,6 +3,10 @@ class DeepseekV3MoECalibrate(torch.nn.Module): + """ + Patched DeepseekV3MoE which sends all tokens to all experts for calibration + """ + def __init__(self, config, experts, gate, shared_experts): super().__init__() self.config = config diff --git a/src/llmcompressor/modeling/prepare.py b/src/llmcompressor/modeling/prepare.py index a8dedf8ee..6944327b0 100644 --- a/src/llmcompressor/modeling/prepare.py +++ b/src/llmcompressor/modeling/prepare.py @@ -1,22 +1,20 @@ -import torch +from compressed_tensors.utils import replace_module from transformers import PreTrainedModel -from transformers.models.deepseek_v3.modeling_deepseek_v3 import DeepseekV3MoE from llmcompressor.modeling.deepseek_v3 import replace as replace_DeepseekV3MoE -from llmcompressor.utils.module import module_bfs -__all__ = ["prepare_for_quantization"] +__all__ = ["prepare_for_calibration"] replacements = { - DeepseekV3MoE: replace_DeepseekV3MoE, + "DeepseekV3MoE": replace_DeepseekV3MoE, } -def prepare_for_quantization(model: PreTrainedModel) -> PreTrainedModel: - def replace(module: torch.nn.Module) -> torch.nn.Module: - if module.__class__ in replacements: - return replacements[module.__class__](module) - else: - return module +def prepare_for_calibration(model: PreTrainedModel) -> PreTrainedModel: + for name, module in model.named_modules(): + cls_name = module.__class__.__name__ + if cls_name in replacements: + new_module = replacements[cls_name](module) + replace_module(model, name, new_module) - return module_bfs(model, replace, progress=True) + return model diff --git a/src/llmcompressor/modifiers/obcq/sgpt_sparsify.py b/src/llmcompressor/modifiers/obcq/sgpt_sparsify.py index c43014a72..f327a4c34 100644 --- a/src/llmcompressor/modifiers/obcq/sgpt_sparsify.py +++ b/src/llmcompressor/modifiers/obcq/sgpt_sparsify.py @@ -3,6 +3,7 @@ import torch import transformers +from loguru import logger SGPT_PRECISION = 
torch.float32 @@ -108,11 +109,12 @@ def sparsify_weight( H = torch.linalg.cholesky(H, upper=True) Hinv = H except torch._C._LinAlgError: - raise torch._C._LinAlgError( + logger.warning( "Failed to invert hessian due to numerical instability. Consider " "increasing SparseGPTModifier.dampening_frac, increasing the number " "of calibration samples, or shuffling the calibration dataset" ) + Hinv = H = torch.eye(num_columns, dtype=H.dtype, device=H.device) # sparsity mask # TODO: consider computing sparsity mask in the same way and place as gptq diff --git a/src/llmcompressor/modifiers/quantization/calibration.py b/src/llmcompressor/modifiers/quantization/calibration.py index 63e1c2a24..b10a4cb31 100644 --- a/src/llmcompressor/modifiers/quantization/calibration.py +++ b/src/llmcompressor/modifiers/quantization/calibration.py @@ -18,6 +18,12 @@ from llmcompressor.observers import Observer from llmcompressor.utils.helpers import getattr_chain +DEFAULT_MAXSHRINK = 0.20 +DEFAULT_PATIENCE = 5 +DEFAULT_AVERAGING_CONSTANT = 0.01 +DEFAULT_GRID = 100.0 +DEFAULT_NORM = 2.4 + __all__ = [ "initialize_observer", "update_weight_zp_scale", @@ -60,9 +66,18 @@ def initialize_observer( False, DynamicType.LOCAL, ): + observer_kwargs = quantization_args.observer_kwargs or {} observer = Observer.load_from_registry( quantization_args.observer, quantization_args=quantization_args, + averaging_constant=observer_kwargs.get( + "averaging_constant", DEFAULT_AVERAGING_CONSTANT + ), + # used by mse observer only, will be ignored by minmax observer + maxshrink=observer_kwargs.get("maxshrink", DEFAULT_MAXSHRINK), + patience=observer_kwargs.get("patience", DEFAULT_PATIENCE), + grid=observer_kwargs.get("grid", DEFAULT_GRID), + norm=observer_kwargs.get("norm", DEFAULT_NORM), ) module.register_module(f"{base_name}_observer", observer) diff --git a/src/llmcompressor/observers/min_max.py b/src/llmcompressor/observers/min_max.py index eca56b6c4..ce5c0e779 100644 --- a/src/llmcompressor/observers/min_max.py +++ b/src/llmcompressor/observers/min_max.py @@ -22,6 +22,7 @@ def __init__( self, quantization_args: QuantizationArgs, averaging_constant: float = 0.01, + **kwargs, ): super().__init__(quantization_args=quantization_args) diff --git a/src/llmcompressor/observers/mse.py b/src/llmcompressor/observers/mse.py index 73e70c202..419155f07 100644 --- a/src/llmcompressor/observers/mse.py +++ b/src/llmcompressor/observers/mse.py @@ -20,18 +20,19 @@ class MovingAverageMSEObserver(Observer): def __init__( self, quantization_args: QuantizationArgs, + maxshrink: float = 0.2, + patience: int = 5, averaging_constant: float = 0.01, grid: float = 100.0, norm: float = 2.4, + **kwargs, ): super().__init__(quantization_args=quantization_args) - kwargs = quantization_args.observer_kwargs or {} - self.maxshrink = kwargs.get("maxshrink", 0.20) - self.patience = kwargs.get("patience", 5) - self.min_val = {} self.max_val = {} + self.maxshrink = maxshrink + self.patience = patience self.averaging_constant = averaging_constant self.grid = grid self.norm = norm diff --git a/src/llmcompressor/pipelines/basic/pipeline.py b/src/llmcompressor/pipelines/basic/pipeline.py index 87f463bfe..605358ae9 100644 --- a/src/llmcompressor/pipelines/basic/pipeline.py +++ b/src/llmcompressor/pipelines/basic/pipeline.py @@ -2,6 +2,7 @@ import torch import tqdm +from compressed_tensors.utils import get_execution_device from torch.utils.data.dataloader import DataLoader from llmcompressor.core import LifecycleCallbacks @@ -9,7 +10,6 @@ from llmcompressor.pipelines.registry 
import CalibrationPipeline from llmcompressor.pytorch.utils.helpers import tensors_to_device from llmcompressor.utils import calibration_forward_context, dispatch_for_generation -from llmcompressor.utils.dev import infer_model_device if TYPE_CHECKING: from llmcompressor.args.dataset_arguments import DatasetArguments @@ -38,9 +38,7 @@ def __call__( :param dataset_args: dataset arguments relevant to pipelines """ dispatch_for_generation(model) # basic dispatch is identical to generation - model_device = getattr(dataset_args, "model_input_device") - if model_device is None: - model_device = infer_model_device(model) + model_device = get_execution_device(model) LifecycleCallbacks.calibration_epoch_start() diff --git a/src/llmcompressor/pipelines/layer_sequential/pipeline.py b/src/llmcompressor/pipelines/layer_sequential/pipeline.py index 089f7dc8c..e5a608708 100644 --- a/src/llmcompressor/pipelines/layer_sequential/pipeline.py +++ b/src/llmcompressor/pipelines/layer_sequential/pipeline.py @@ -2,7 +2,7 @@ import torch import tqdm -from compressed_tensors.utils import disable_offloading +from compressed_tensors.utils import disable_offloading, get_execution_device from torch.utils.data.dataloader import DataLoader from llmcompressor.core import LifecycleCallbacks, active_session @@ -19,7 +19,6 @@ dispatch_for_sequential, get_sequential_targets, ) -from llmcompressor.utils.dev import infer_model_device from llmcompressor.utils.helpers import DisableQuantization, calibration_forward_context if TYPE_CHECKING: @@ -61,9 +60,7 @@ def __call__( # prepare model for sequential onloading dispatch_for_sequential(model) - model_device = getattr(dataset_args, "model_input_device") - if model_device is None: - model_device = infer_model_device(model) + model_device = get_execution_device(model) # find layers modifiers = session.get_modifiers() diff --git a/src/llmcompressor/pipelines/sequential/pipeline.py b/src/llmcompressor/pipelines/sequential/pipeline.py index c9ce9cd30..9a2b8f3c9 100644 --- a/src/llmcompressor/pipelines/sequential/pipeline.py +++ b/src/llmcompressor/pipelines/sequential/pipeline.py @@ -1,7 +1,7 @@ from typing import TYPE_CHECKING import torch -from compressed_tensors.utils import disable_offloading +from compressed_tensors.utils import disable_offloading, get_execution_device from torch.utils.data.dataloader import DataLoader from tqdm import tqdm @@ -14,7 +14,6 @@ get_sequential_targets, trace_subgraphs, ) -from llmcompressor.utils.dev import infer_model_device from llmcompressor.utils.helpers import DisableQuantization, calibration_forward_context if TYPE_CHECKING: @@ -55,9 +54,7 @@ def __call__( # prepare model for sequential onloading dispatch_for_sequential(model) - model_device = getattr(dataset_args, "model_input_device") - if model_device is None: - model_device = infer_model_device(model) + model_device = get_execution_device(model) # prepare to trace subgraphs modifiers = session.get_modifiers() diff --git a/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py b/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py index 4832e3b7f..0465b2a8d 100644 --- a/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py +++ b/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py @@ -1,11 +1,9 @@ import os -import re import weakref from functools import wraps from typing import Optional import torch -import transformers from accelerate.accelerator import get_state_dict_offloaded_model from compressed_tensors 
import ( CompressionFormat, @@ -86,11 +84,6 @@ def save_pretrained_wrapper( :param kwargs: additional kwargs to pass on to model.save_pretrained """ - # HACK: Override the dtype_byte_size function in transformers to - # support float8 types. Fix is posted upstream - # https://github.com/huggingface/transformers/pull/30488 - transformers.modeling_utils.dtype_byte_size = new_dtype_byte_size - # compress model using compressor compressor = get_model_compressor( model=model, @@ -128,18 +121,6 @@ def save_pretrained_wrapper( model.save_pretrained = save_pretrained_compressed(model.save_pretrained) -# HACK: Override the dtype_byte_size function in transformers to support float8 types -# Fix is posted upstream https://github.com/huggingface/transformers/pull/30488 -def new_dtype_byte_size(dtype): - if dtype == torch.bool: - return 1 / 8 - bit_search = re.search(r"[^\d](\d+)_?", str(dtype)) - if bit_search is None: - raise ValueError(f"`dtype` is not a valid dtype: {dtype}.") - bit_size = int(bit_search.groups()[0]) - return bit_size // 8 - - def patch_tied_tensors_bug(model: torch.nn.Module): """ Patches bug where HF transformers will fail to untie weights under specific diff --git a/src/llmcompressor/utils/dev.py b/src/llmcompressor/utils/dev.py index 558ef816e..f0feb6c04 100644 --- a/src/llmcompressor/utils/dev.py +++ b/src/llmcompressor/utils/dev.py @@ -7,7 +7,7 @@ import torch from accelerate import dispatch_model, infer_auto_device_map from accelerate.utils import get_balanced_memory -from compressed_tensors.utils import has_offloaded_params, remove_dispatch +from compressed_tensors.utils import remove_dispatch from huggingface_hub import snapshot_download from safetensors.torch import save_file from transformers import AutoModelForCausalLM, PreTrainedModel @@ -20,7 +20,6 @@ "skip_weights_download", "patch_transformers_logger_level", "dispatch_for_generation", - "infer_model_device", ] @@ -141,28 +140,3 @@ def dispatch_for_generation(model: PreTrainedModel) -> PreTrainedModel: ) return dispatch_model(model, device_map=device_map) - - -def infer_model_device(model: PreTrainedModel) -> torch.device: - """ - Gets the model's execution device (the device that model inputs should be on) - using non-guaranteed but reasonable assumptions about module and parameter order. 
- - If a model is offloaded, assume that modules execute in the same order - that they are returned by `model.modules()` - - If a model is not offloaded, assume that parameters are used in the same order - that they are returned by `model.parameters()` - - :param model: model whose execution device is being inferred - :return: device which model inputs should be put on - """ - for module in model.modules(): - if has_offloaded_params(module): - return module._hf_hook.execution_device - - first_param = next(model.parameters(), None) - if first_param is None: - return torch.device("cpu") - - return first_param.device From 863377ea1dc4333bc35ffa7817d32111e11ca4b2 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 26 Jun 2025 14:27:30 -0400 Subject: [PATCH 21/22] remove extra file Signed-off-by: Kyle Sayers --- src/llmcompressor/utils/module.py | 27 --------------------------- 1 file changed, 27 deletions(-) delete mode 100644 src/llmcompressor/utils/module.py diff --git a/src/llmcompressor/utils/module.py b/src/llmcompressor/utils/module.py deleted file mode 100644 index a02aa8b4a..000000000 --- a/src/llmcompressor/utils/module.py +++ /dev/null @@ -1,27 +0,0 @@ -from typing import Callable, Union - -import torch -import tqdm - -__all__ = ["module_bfs"] - - -def module_bfs( - module: torch.nn.Module, - func: Callable[[torch.nn.Module], torch.nn.Module], - pre: bool = True, - progress: Union[bool, tqdm.tqdm] = False, -) -> torch.nn.Module: - if progress is True: - total = len(list(module.modules())) - progress = tqdm.tqdm(total=total) - if pre: - module = func(module) - for name, child in list(module.named_children()): - module.add_module(name, module_bfs(child, func, pre, progress)) - if not pre: - module = func(module) - if isinstance(progress, tqdm.tqdm): - progress.update(1) - - return module From 2f5de103abc4cf5fdd9197af9029ef8182021a5a Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 26 Jun 2025 14:31:24 -0400 Subject: [PATCH 22/22] convert to fp8 examples Signed-off-by: Kyle Sayers --- examples/quantizing_moe/README.md | 14 +++++++------- examples/quantizing_moe/mixtral_example.py | 4 ++-- examples/quantizing_moe/qwen_example.py | 4 ++-- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/examples/quantizing_moe/README.md b/examples/quantizing_moe/README.md index f2a162d91..8a9b257f4 100644 --- a/examples/quantizing_moe/README.md +++ b/examples/quantizing_moe/README.md @@ -1,6 +1,6 @@ -# Quantizing Mixtral-8x7B-Instruct-v0.1 Model with W4A16 +# Quantizing Mixtral-8x7B-Instruct-v0.1 Model with FP8 -This directory contains an example script for quantizing the `Mixtral-8x7B-Instruct-v0.1` model using the static per-tensor W4A16 quantization scheme. +This directory contains an example script for quantizing the `Mixtral-8x7B-Instruct-v0.1` model using the static per-tensor FP8 quantization scheme. ## Installation @@ -22,7 +22,7 @@ python3 mixtral_example.py ## Creating a Quantized MoE Model -This example leverages `llm-compressor` and `compressed-tensors` to create an W4A16-quantized `Mixtral-8x7B-Instruct-v0.1` model. The model is calibrated and trained using the `ultrachat_200k` dataset. +This example leverages `llm-compressor` and `compressed-tensors` to create an FP8-quantized `Mixtral-8x7B-Instruct-v0.1` model. The model is calibrated and trained using the `ultrachat_200k` dataset. 
You can follow the detailed steps below or simply run the example script with: @@ -36,24 +36,24 @@ In this step, you'll choose a baseline model for quantization, a dataset for cal - **Models**: Can be referenced from a local directory or retrieved from the Hugging Face Hub. - **Datasets**: Can also be from a local directory or the Hugging Face Hub. -- **Recipes**: These are YAML files or Python modifier objects that describe how a model should be optimized during or after training. In this example, we use a `QuantizationModifier` object with the scheme set to `W4A16`. +- **Recipes**: These are YAML files or Python modifier objects that describe how a model should be optimized during or after training. In this example, we use a `QuantizationModifier` object with the scheme set to `FP8`. ```python from llmcompressor.modifiers.quantization import QuantizationModifier -recipe = QuantizationModifier(scheme="W4A16", targets="Linear", ignore=["lm_head", "re:.*block_sparse_moe.gate"]) +recipe = QuantizationModifier(scheme="FP8", targets="Linear", ignore=["lm_head", "re:.*block_sparse_moe.gate"]) ``` NOTE: `.*block_sparse_moe.gate` layers do not quantize well, hence they are ignored! ### Step 2: Run Quantization Using Oneshot -The `oneshot` method applies the selected recipe to your model and dataset without requiring any fine-tuning. The model will be sparsified and saved to `Mixtral-8x7B-Instruct-v0.1-W4A16-G128`. +The `oneshot` method applies the selected recipe to your model and dataset without requiring any fine-tuning. The model will be sparsified and saved to `Mixtral-8x7B-Instruct-v0.1-FP8`. ```python from llmcompressor import oneshot -output_dir = "Mixtral-8x7B-Instruct-v0.1-W4A16-G128" +output_dir = "Mixtral-8x7B-Instruct-v0.1-FP8" oneshot( model=model, diff --git a/examples/quantizing_moe/mixtral_example.py b/examples/quantizing_moe/mixtral_example.py index 49b08c722..5021c7947 100644 --- a/examples/quantizing_moe/mixtral_example.py +++ b/examples/quantizing_moe/mixtral_example.py @@ -55,7 +55,7 @@ def tokenize(sample): # since the MoE gate layers are sensitive to quantization, we add them to the ignore # list so they remain at full precision recipe = QuantizationModifier( - scheme="W4A16", + scheme="FP8", targets="Linear", ignore=[ "lm_head", @@ -81,6 +81,6 @@ def tokenize(sample): print("==========================================") # Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16-G128" +SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantizing_moe/qwen_example.py b/examples/quantizing_moe/qwen_example.py index 2531e6528..bb00b530e 100644 --- a/examples/quantizing_moe/qwen_example.py +++ b/examples/quantizing_moe/qwen_example.py @@ -56,7 +56,7 @@ def tokenize(sample): # list so they remain at full precision recipe = GPTQModifier( targets="Linear", - scheme="W4A16", + scheme="FP8", ignore=["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"], ) @@ -80,6 +80,6 @@ def tokenize(sample): print("==========================================") # Save to disk in compressed-tensors format. -SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16-G128" +SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR)
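
For readers following the series end to end, below is a minimal sketch of the calibration flow as it stands after the final `prepare.py` above, which exposes `prepare_for_calibration` (keyed by module class name and applied via `compressed_tensors.utils.replace_module`). The sketch condenses the example scripts in this series rather than reproducing any one of them verbatim; the dataset name, split slicing, and save-directory suffix are illustrative assumptions, and models without a registered MoE class pass through `prepare_for_calibration` unchanged.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modeling import prepare_for_calibration
from llmcompressor.modifiers.quantization import GPTQModifier

# Full-precision (BF16) checkpoint, as in the DeepSeek-V3 example in this series
MODEL_ID = "RedHatAI/DeepSeek-V3-BF16"

model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Swap DeepseekV3MoE blocks for the calibration variant that routes every
# token to every expert; other architectures are returned unchanged.
model = prepare_for_calibration(model)

# W4A16 GPTQ, keeping the sensitive MoE gates and lm_head at full precision
recipe = GPTQModifier(
    targets="Linear",
    scheme="W4A16",
    ignore=["lm_head", "re:.*mlp.gate$"],
)

oneshot(
    model=model,
    tokenizer=tokenizer,
    dataset="ultrachat_200k",  # assumed registered dataset name; any calibration set works
    splits={"calibration": "train_sft[:512]"},
    recipe=recipe,
    max_seq_length=2048,
    num_calibration_samples=512,
    # only one attention/MLP block is onloaded to GPU memory at a time
    sequential_targets=["DeepseekV3Attention", "DeepseekV3MLP"],
)

SAVE_DIR = MODEL_ID.split("/")[-1] + "-W4A16-G128"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
```

If a quick sanity generation is wanted before saving, `dispatch_for_generation(model)` from `llmcompressor.utils` can be used exactly as in the updated `mixtral_example.py` and `qwen_example.py` scripts.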