
Commit b30eade

deepseekv3
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
1 parent 6800f81 commit b30eade

6 files changed, +195 -0 lines changed
Lines changed: 85 additions & 0 deletions
@@ -0,0 +1,85 @@
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor.modeling import prepare_for_quantization
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import oneshot

# Select model and load it.
model_id = "RedHatAI/DeepSeek-V3-BF16"
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = prepare_for_quantization(model)

# Select calibration dataset.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"

# Select number of samples. 512 samples is a good place to start.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048

# Load dataset and preprocess.
ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
ds = ds.shuffle(seed=42)


def preprocess(example):
    return {
        "text": tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
        )
    }


ds = ds.map(preprocess)


# Tokenize inputs.
def tokenize(sample):
    return tokenizer(
        sample["text"],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )


ds = ds.map(tokenize, remove_columns=ds.column_names)

# Configure the quantization algorithm to run.
# * quantize the weights to 4 bits with GPTQ, using a group size of 128
recipe = GPTQModifier(
    targets="Linear",
    scheme="W4A16",
    ignore=["lm_head"],
    sequential_targets=["DeepseekV3Attention", "DeepseekV3MLP"],
)

# Apply algorithms.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

# Save the compressed model to disk.
SAVE_DIR = model_id.split("/")[-1] + "-W4A16-G128"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)

# Load the model back after saving.
model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto")

# Confirm generations of the quantized model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================\n\n")

src/llmcompressor/entrypoints/oneshot.py

Lines changed: 10 additions & 0 deletions
@@ -2,6 +2,8 @@
 from datetime import datetime
 from typing import TYPE_CHECKING, List, Optional, Union
 
+import torch
+from compressed_tensors.utils import offloaded_dispatch
 from loguru import logger
 from torch.utils.data import DataLoader
 from transformers import PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin
@@ -127,6 +129,14 @@ def __init__(
         # initialize the model and processor
         pre_process(model_args)
 
+        # offload to cpu if possible
+        if "cuda" in str(model_args.oneshot_device) and torch.cuda.is_available():
+            offloaded_dispatch(
+                model_args.model, execution_device=model_args.oneshot_device
+            )
+        else:
+            logger.warning("CUDA is not available! Compressing model on CPU instead")
+
         # Set instance attributes
         self.model = self.model_args.model
         self.processor = self.model_args.processor
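For context, a minimal standalone sketch of the dispatch pattern this hunk introduces, using only the call shown above (the model-loading line is an assumption borrowed from the example script; it is not part of this diff):

import torch
from compressed_tensors.utils import offloaded_dispatch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("RedHatAI/DeepSeek-V3-BF16", torch_dtype="auto")

if torch.cuda.is_available():
    # Weights remain offloaded and are moved onto the execution device as needed.
    offloaded_dispatch(model, execution_device=torch.device("cuda:0"))
else:
    # Otherwise calibration runs on CPU, as the warning above indicates.
    pass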
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
# flake8: noqa

from .prepare import *
Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
import torch
from transformers.models.deepseek_v3.modeling_deepseek_v3 import DeepseekV3MoE


class DeepseekV3MoECalibrate(torch.nn.Module):
    # Calibration-friendly MoE forward: every expert is executed on every pass,
    # and outputs are accumulated only for the tokens actually routed to each expert.
    def __init__(self, config, experts, gate, shared_experts):
        super().__init__()
        self.config = config
        self.experts = experts
        self.gate = gate
        self.shared_experts = shared_experts

    def forward(self, hidden_states):
        residuals = hidden_states
        orig_shape = hidden_states.shape
        topk_indices, topk_weights = self.gate(hidden_states)
        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])

        # Begin MoE
        final_hidden_states = torch.zeros_like(hidden_states, dtype=topk_weights.dtype)
        expert_mask = torch.nn.functional.one_hot(
            topk_indices, num_classes=len(self.experts)
        )
        expert_mask = expert_mask.permute(2, 0, 1)

        for expert_idx in range(len(self.experts)):
            expert = self.experts[expert_idx]
            mask = expert_mask[expert_idx]
            token_indices, weight_indices = torch.where(mask)

            expert_weights = topk_weights[token_indices, weight_indices]
            expert_input = hidden_states[token_indices]
            expert_output = expert(expert_input)
            weighted_output = expert_output * expert_weights.unsqueeze(-1)

            if token_indices.numel() > 0:
                final_hidden_states.index_add_(0, token_indices, weighted_output)
        # End MoE

        hidden_states = final_hidden_states.type(hidden_states.dtype).view(*orig_shape)
        hidden_states = hidden_states + self.shared_experts(residuals)
        return hidden_states


def replace(module: DeepseekV3MoE) -> DeepseekV3MoECalibrate:
    return DeepseekV3MoECalibrate(
        module.config, module.experts, module.gate, module.shared_experts
    )
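A hedged illustration of what replace() does for a single decoder layer. The layer index and the model.model.layers[i].mlp attribute path are assumptions based on the Hugging Face DeepseekV3 layout; prepare_for_quantization in the next file applies the same swap across the whole model:

from transformers.models.deepseek_v3.modeling_deepseek_v3 import DeepseekV3MoE
from llmcompressor.modeling.deepseek_v3 import replace

# Assumed attribute path and layer index, for illustration only.
layer = model.model.layers[3]
if isinstance(layer.mlp, DeepseekV3MoE):
    layer.mlp = replace(layer.mlp)  # swap in the calibration-friendly forward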

src/llmcompressor/modeling/prepare.py

Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
import torch
from transformers import PreTrainedModel
from transformers.models.deepseek_v3.modeling_deepseek_v3 import DeepseekV3MoE

from llmcompressor.modeling.deepseek_v3 import replace as replace_DeepseekV3MoE
from llmcompressor.utils.module import module_bfs

__all__ = ["prepare_for_quantization"]

replacements = {
    DeepseekV3MoE: replace_DeepseekV3MoE,
}


def prepare_for_quantization(model: PreTrainedModel) -> PreTrainedModel:
    def replace(module: torch.nn.Module) -> torch.nn.Module:
        if module.__class__ in replacements:
            return replacements[module.__class__](module)
        else:
            return module

    return module_bfs(model, replace, progress=True)
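For orientation, a minimal usage sketch mirroring the example script at the top of this commit: the model is rewritten before oneshot so that every DeepseekV3MoE block is swapped for its calibration variant.

from transformers import AutoModelForCausalLM
from llmcompressor.modeling import prepare_for_quantization

model = AutoModelForCausalLM.from_pretrained("RedHatAI/DeepSeek-V3-BF16", torch_dtype="auto")
model = prepare_for_quantization(model)  # swaps DeepseekV3MoE -> DeepseekV3MoECalibrate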

src/llmcompressor/utils/module.py

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
from typing import Callable, Union

import torch
import tqdm

__all__ = ["module_bfs"]


def module_bfs(
    module: torch.nn.Module,
    func: Callable[[torch.nn.Module], torch.nn.Module],
    pre: bool = True,
    progress: Union[bool, tqdm.tqdm] = False,
) -> torch.nn.Module:
    if progress is True:
        total = len(list(module.modules()))
        progress = tqdm.tqdm(total=total)
    if pre:
        module = func(module)
    for name, child in list(module.named_children()):
        module.add_module(name, module_bfs(child, func, pre, progress))
    if not pre:
        module = func(module)
    if isinstance(progress, tqdm.tqdm):
        progress.update(1)

    return module
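A small usage sketch, not part of the commit: module_bfs walks the module tree, applies func to each module (the root before its children when pre=True), and re-attaches whatever func returns. The transform and toy model below are purely illustrative.

import torch
from llmcompressor.utils.module import module_bfs


def upcast_layernorms(module: torch.nn.Module) -> torch.nn.Module:
    # Illustrative transform: return a replacement for modules of interest,
    # pass everything else through unchanged.
    if isinstance(module, torch.nn.LayerNorm):
        return module.float()
    return module


toy = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.LayerNorm(8))
toy = module_bfs(toy, upcast_layernorms, progress=True)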
