
Commit 14a4fa7

introduce Mamba2Layer abstraction to allow mamba layers other than MambaMixer2 in v1 engine
Signed-off-by: nopperl <54780682+nopperl@users.noreply.github.com>
1 parent d8ee5a2 commit 14a4fa7

File tree

3 files changed: +33 -6 lines changed


vllm/model_executor/layers/mamba/mamba_mixer2.py

Lines changed: 28 additions & 1 deletion

@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from abc import ABC, abstractmethod
+from collections.abc import Iterable
 from typing import Optional, Union
 
 import torch
@@ -216,9 +218,34 @@ def loader(param: torch.Tensor, loaded_weight: torch.Tensor) -> None:
     return loader
 
 
+class Mamba2Layer(ABC):
+    """
+    Base class for all Mamba2 layers which support the v1 engine.
+    Inherit from this class if you implement a custom Mamba2 layer.
+    """
+
+    chunk_size: int
+
+    # Contains the KV cache (mamba state) for the layer
+    # in the shape specified by `self.get_state_shape`.
+    # The outer list is for v0 PP virtual engine. Though this code path
+    # only runs for v1, we have to do this to unify with the interface
+    # of Attention + v0 PP.
+    kv_cache: list[tuple[torch.Tensor]]
+
+    @abstractmethod
+    def get_state_shape(self) -> Iterable[tuple[int, ...]]:
+        """
+        Defines the shape of the mamba state.
+        Usually, the mamba state is a (conv_state, ssm_state) tuple.
+        In this case, returns (conv_state_shape, ssm_state_shape).
+        """
+        pass
+
+
 # Adapted from transformers.models.mamba.modeling_mamba.MambaMixer
 @CustomOp.register("mamba_mixer2")
-class MambaMixer2(CustomOp):
+class MambaMixer2(Mamba2Layer, CustomOp):
     """
     Compute ∆, A, B, C, and D the state space parameters and compute
     the `contextualized_states`. A, D are input independent
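
To illustrate the contract this abstraction introduces, here is a minimal sketch of a custom layer built on it. The class name, constructor parameters, and state shapes below are hypothetical and not part of this commit; only the `chunk_size` / `kv_cache` attributes and `get_state_shape` come from the interface itself.

from collections.abc import Iterable

import torch

from vllm.model_executor.layers.mamba.mamba_mixer2 import Mamba2Layer


class MyMamba2Variant(Mamba2Layer):
    """Hypothetical Mamba2 variant; the shapes here are illustrative only."""

    def __init__(self, conv_dim: int, conv_kernel: int, num_heads: int,
                 head_dim: int, state_size: int, chunk_size: int) -> None:
        # Required by the Mamba2Layer interface.
        self.chunk_size = chunk_size
        # Populated by the engine with the actual state buffers.
        self.kv_cache = [(torch.tensor([]), )]
        self._conv_state_shape = (conv_dim, conv_kernel - 1)
        self._ssm_state_shape = (num_heads, head_dim, state_size)

    def get_state_shape(self) -> Iterable[tuple[int, ...]]:
        # A (conv_state_shape, ssm_state_shape) tuple, as the base class
        # docstring suggests.
        return (self._conv_state_shape, self._ssm_state_shape)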

vllm/v1/attention/backends/mamba_attn.py

Lines changed: 2 additions & 2 deletions

@@ -21,8 +21,8 @@
 
 
 def get_mamba2_chunk_size(vllm_config: VllmConfig) -> int:
-    from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2
-    layers = get_layers_from_vllm_config(vllm_config, MambaMixer2)
+    from vllm.model_executor.layers.mamba.mamba_mixer2 import Mamba2Layer
+    layers = get_layers_from_vllm_config(vllm_config, Mamba2Layer)
     chunk_sizes = set(layer.chunk_size for layer in layers.values())
     assert len(
         chunk_sizes) == 1, "All Mamba2 layers must have the same chunk size"
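
The one-line widening works because `get_layers_from_vllm_config` selects layers by type, and `isinstance` checks match subclasses, so requesting the abstract `Mamba2Layer` still finds every `MambaMixer2` plus any future variant. A rough sketch of that kind of filtering, with a hypothetical helper name:

def filter_layers_by_type(layers: dict[str, object],
                          layer_type: type) -> dict[str, object]:
    # isinstance() matches subclasses too, so requesting the abstract
    # Mamba2Layer base also returns concrete MambaMixer2 layers.
    return {
        name: layer
        for name, layer in layers.items() if isinstance(layer, layer_type)
    }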

vllm/v1/worker/gpu_model_runner.py

Lines changed: 3 additions & 3 deletions

@@ -31,7 +31,7 @@
 from vllm.forward_context import (DPMetadata, get_forward_context,
                                   set_forward_context)
 from vllm.logger import init_logger
-from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2
+from vllm.model_executor.layers.mamba.mamba_mixer2 import Mamba2Layer
 from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
 from vllm.model_executor.model_loader import TensorizerLoader, get_model_loader
 from vllm.model_executor.models.interfaces import (has_step_pooler,
@@ -2660,7 +2660,7 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
                 f"Unknown attention type: {attn_module.attn_type}")
 
         mamba_layers = get_layers_from_vllm_config(self.vllm_config,
-                                                   MambaMixer2)
+                                                   Mamba2Layer)
         if len(mamba_layers) > 0:
             if self.vllm_config.speculative_config is not None:
                 raise NotImplementedError(
@@ -2691,7 +2691,7 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
     def _maybe_pad_mamba_page_size(
         self,
         attn_layers: dict[str, Attention],
-        mamba_layers: dict[str, MambaMixer2],
+        mamba_layers: dict[str, Mamba2Layer],
         kv_cache_spec: dict[str, KVCacheSpec],
         max_model_len: int,
         block_size: int,
