
Commit 528cdc8

rebase; fix
1 parent 945007c commit 528cdc8

5 files changed (+22 / -27 lines)


examples/quantization_w4a4_fp4/qwen_30b_a2b.py renamed to examples/quantization_w4a4_fp4/qwen_30b_a3b.py

Lines changed: 1 addition & 2 deletions
@@ -15,8 +15,7 @@
 DATASET_ID = "HuggingFaceH4/ultrachat_200k"
 DATASET_SPLIT = "train_sft"
 
-# Select number of samples. 512 samples is a good place to start.
-# Increasing the number of samples can improve accuracy.
+# Select number of samples
 NUM_CALIBRATION_SAMPLES = 20
 MAX_SEQUENCE_LENGTH = 2048
 
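For orientation, a hedged sketch of how constants like these are typically consumed when building a calibration set in llmcompressor examples; the actual preprocessing in the renamed script is not shown in this diff, and the `tokenizer` object below is an assumption.

```python
# Sketch only: typical use of the constants above to build a calibration set.
# `tokenizer` is assumed to be an already-loaded HF tokenizer; the real example
# script may preprocess differently.
from datasets import load_dataset

ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))

def preprocess(sample):
    # Render the chat sample to text, then truncate to MAX_SEQUENCE_LENGTH tokens.
    text = tokenizer.apply_chat_template(sample["messages"], tokenize=False)
    return tokenizer(text, max_length=MAX_SEQUENCE_LENGTH, truncation=True)

ds = ds.map(preprocess, remove_columns=ds.column_names)
```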

src/llmcompressor/modeling/deepseek_v3.py

Lines changed: 3 additions & 2 deletions
@@ -1,10 +1,11 @@
 import torch
 from transformers.models.deepseek_v3.configuration_deepseek_v3 import DeepseekV3Config
 from transformers.models.deepseek_v3.modeling_deepseek_v3 import (
-    DeepseekV3MoE as OriginalDeepseekV3MoE
+    DeepseekV3MoE as OriginalDeepseekV3MoE,
 )
 
-class DeepseekV3MoE(torch.nn.Module):
+
+class DeepseekV3MoECalibrate(torch.nn.Module):
     """
     Patched DeepseekV3MoE which sends all tokens to all experts for calibration
     """

src/llmcompressor/modeling/llama4.py

Lines changed: 0 additions & 2 deletions
@@ -11,8 +11,6 @@
 
 from llmcompressor.utils.dev import skip_weights_initialize
 
-__all__ = ["SequentialLlama4TextMoe"]
-
 
 class SequentialLlama4TextMoe(torch.nn.Module):
     def __init__(self, config: Llama4TextConfig, original: Llama4TextMoe):

src/llmcompressor/modeling/prepare.py

Lines changed: 6 additions & 14 deletions
@@ -34,27 +34,19 @@ def update_qwen3_moe(model, stack):
         if cls_name == "Qwen3MoeDecoderLayer":
             # Optionally update the model.config to pass in other arguments
             stack.enter_context(
-                patch_attr(module, "mlp", replace_Qwen3MoE(model.config, module.mlp))
-            )
-
-
-def update_deepseek3_moe(model, stack):
-    for module in model.modules():
-        cls_name = module.__class__.__name__
-        if (
-            cls_name == "DeepseekV3DecoderLayer"
-            and module.mlp.__class__.__name__ == "DeepseekV3MoE"
-        ):
-            stack.enter_context(
-                patch_attr(module, "mlp", replace_DeepseekV3MoE(module.mlp))
+                patch_attr(
+                    module,
+                    "mlp",
+                    replace_Qwen3MoE(config=model.config, module=module.mlp),
+                )
             )
 
 
 moe_context = {
     "Qwen3MoeForCausalLM": update_qwen3_moe,
-    # "DeepseekV3ForCausalLM": update_deepseek3_moe, TODO: uncomment when tested
 }
 
+
 def moe_calibration_context(model: PreTrainedModel, stack):
     # Temporarily updates the MoE modules within the context
     # Once the context exists, parameter updates persist
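For readers unfamiliar with the patching pattern above, this is a minimal sketch of how an attribute-patching context manager composes with `ExitStack`; it is illustrative only (the real `patch_attr` in llmcompressor may differ), and `model` is assumed to be an already-loaded Qwen3-MoE model.

```python
# Illustrative sketch of the patch-and-restore pattern, not the library's code.
from contextlib import ExitStack, contextmanager

@contextmanager
def patch_attr_sketch(obj, name, value):
    """Temporarily replace obj.<name> with value, restoring the original on exit."""
    original = getattr(obj, name)
    setattr(obj, name, value)
    try:
        yield
    finally:
        setattr(obj, name, original)

# Usage mirroring moe_calibration_context: each MoE layer's `mlp` is swapped for its
# calibration variant while the stack is open and restored when the stack closes;
# weight updates persist because the wrapper reuses the original gate/experts modules.
with ExitStack() as stack:
    moe_calibration_context(model, stack)
    # ... run calibration forward passes here ...
```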

src/llmcompressor/modeling/qwen3_moe.py

Lines changed: 12 additions & 7 deletions
@@ -15,21 +15,26 @@
 # limitations under the License.
 
 import torch
+from transformers.models import Qwen3MoeConfig
+from transformers.models.qwen3_moe.modeling_qwen3_moe import (
+    Qwen3MoeSparseMoeBlock as OriginalQwen3MoeSparseMoeBlock,
+)
 
 
 class Qwen3MoeSparseMoeBlock(torch.nn.Module):
-    def __init__(self, config, gate, experts):
+    def __init__(
+        self, config: Qwen3MoeConfig, original: OriginalQwen3MoeSparseMoeBlock
+    ):
         super().__init__()
         self.num_experts = config.num_experts
-        self.top_k = config.num_experts
+        self.top_k = config.top_k
         self.norm_topk_prob = config.norm_topk_prob
 
         # gating
-        self.gate = gate
-        self.experts = experts
+        self.gate = original.gate
+        self.experts = original.experts
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        """ """
         batch_size, sequence_length, hidden_dim = hidden_states.shape
         hidden_states = hidden_states.view(-1, hidden_dim)
         # router_logits: (batch * sequence_length, n_experts)
@@ -81,5 +86,5 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         return final_hidden_states, router_logits
 
 
-def replace(config, module):
-    return Qwen3MoeSparseMoeBlock(config, module.gate, module.experts)
+def replace(config: Qwen3MoeConfig, module: OriginalQwen3MoeSparseMoeBlock):
+    return Qwen3MoeSparseMoeBlock(config=config, original=module)
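The `self.top_k = config.top_k` change above controls how many experts each token is routed to in `forward`. Below is a simplified sketch of that standard top-k routing step, assuming the same attribute names as the class in this diff; the helper name is hypothetical and not part of the commit.

```python
import torch

def route_tokens_sketch(router_logits: torch.Tensor, top_k: int, norm_topk_prob: bool):
    """router_logits: (tokens, n_experts) -> per-token expert weights and indices."""
    routing_weights = torch.softmax(router_logits, dim=-1, dtype=torch.float)
    routing_weights, selected_experts = torch.topk(routing_weights, top_k, dim=-1)
    if norm_topk_prob:
        # Re-normalize so each token's selected expert weights sum to 1.
        routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
    return routing_weights, selected_experts
```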
