Skip to content

Commit a24cb91

Browse files
authored
[Model] Fix minimax model cache & lm_head precision (#19592)
Signed-off-by: qingjun <qingjun@minimaxi.com>
1 parent 7e8d97d commit a24cb91

File tree

1 file changed

+3
-3
lines changed

1 file changed

+3
-3
lines changed

vllm/model_executor/models/minimax_text_01.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -856,7 +856,7 @@ def layer_fn(prefix):
856856
self._dtype = _dummy.dtype
857857
del _dummy
858858

859-
self.minimax_cache = MinimaxCacheManager(dtype=self._dtype,
859+
self.minimax_cache = MinimaxCacheManager(dtype=torch.float32,
860860
cache_shape=self.cache_shape)
861861

862862
rope_theta = getattr(config, "rope_theta", 10000)
@@ -1021,7 +1021,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
10211021

10221022
else:
10231023
self.lm_head = PPMissingLayer()
1024-
1024+
self.lm_head.float()
10251025
flash_layer_count = sum(1 for attn_type in self.config.attn_type_list
10261026
if attn_type == 1)
10271027
self.kv_cache = [torch.tensor([]) for _ in range(flash_layer_count)]
@@ -1054,7 +1054,7 @@ def forward(self,
10541054

10551055
def compute_logits(self, hidden_states: torch.Tensor,
10561056
sampling_metadata: SamplingMetadata) -> torch.Tensor:
1057-
logits = self.logits_processor(self.lm_head, hidden_states,
1057+
logits = self.logits_processor(self.lm_head, hidden_states.float(),
10581058
sampling_metadata)
10591059

10601060
return logits

0 commit comments

Comments (0)