
Commit 3534c39

[V1] [Hybrid] Refactor mamba state shape calculation; enable V1 via cli (#20840)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
1 parent c586b55 commit 3534c39

14 files changed: +441 -353 lines changed

docs/usage/v1_guide.md

Lines changed: 1 addition & 2 deletions
@@ -112,8 +112,7 @@ enforcing eager mode and disabling prefix caching in V1.
 Models that combine Mamba-2 layers with standard attention layers are also supported (e.g., `BambaForCausalLM`,
 `Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`). Please note that
 these models currently require enforcing eager mode, disabling prefix caching, and using the FlashInfer attention
-backend in V1. It is also necessary to pass a non-standard block size for attention layers (this is not possible
-using the `vllm serve` CLI yet).
+backend in V1.
 
 #### Encoder-Decoder Models

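With the block-size requirement gone, the remaining constraints from the guide can be expressed directly when constructing an engine. A minimal sketch, assuming the `ibm-ai-platform/Bamba-9B-v1` checkpoint that also appears in the test file below; the environment variables and constructor arguments are standard vLLM settings rather than additions from this commit:

```python
# Minimal sketch (not part of this diff): running a hybrid Mamba/attention model
# on V1 under the constraints the guide describes. Model name and prompt are
# illustrative.
import os

os.environ["VLLM_USE_V1"] = "1"                      # opt in to the V1 engine
os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"  # backend required for the attention layers

from vllm import LLM, SamplingParams

llm = LLM(
    model="ibm-ai-platform/Bamba-9B-v1",
    enforce_eager=True,            # eager mode is still required
    enable_prefix_caching=False,   # prefix caching is still disabled
    # note: no non-standard attention block_size override is needed anymore
)
print(llm.generate(["Hello"], SamplingParams(max_tokens=16))[0].outputs[0].text)
```
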
tests/models/language/generation/test_hybrid.py

Lines changed: 1 addition & 15 deletions
@@ -61,14 +61,6 @@
     "tiiuae/Falcon-H1-0.5B-Base",
 ]
 
-ATTN_BLOCK_SIZES = {
-    "ibm-ai-platform/Bamba-9B-v1": 528,
-    "Zyphra/Zamba2-1.2B-instruct": 80,
-    "nvidia/Nemotron-H-8B-Base-8K": 528,
-    "ibm-granite/granite-4.0-tiny-preview": 400,
-    "tiiuae/Falcon-H1-0.5B-Base": 800,
-}
-
 # Avoid OOM
 MAX_NUM_SEQS = 4
 
@@ -105,11 +97,6 @@ def test_models(
             example_prompts, max_tokens, num_logprobs)
 
     if model in V1_SUPPORTED_MODELS:
-        if model in HYBRID_MODELS and model in ATTN_BLOCK_SIZES:
-            block_size = ATTN_BLOCK_SIZES[model]
-        else:
-            block_size = 16
-
         with monkeypatch.context() as m:
             m.setenv("VLLM_USE_V1", "1")
             if model in HYBRID_MODELS:
@@ -118,8 +105,7 @@ def test_models(
             with vllm_runner(model,
                              max_num_seqs=MAX_NUM_SEQS,
                              enforce_eager=True,
-                             enable_prefix_caching=False,
-                             block_size=block_size) as vllm_model:
+                             enable_prefix_caching=False) as vllm_model:
                 vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
                     example_prompts, max_tokens, num_logprobs)
     else:

vllm/config.py

Lines changed: 8 additions & 1 deletion
@@ -1630,6 +1630,9 @@ class CacheConfig:
     checkpoint if available. Otherwise, the scales will default to 1.0."""
     cpu_kvcache_space_bytes: Optional[int] = None
     """(CPU backend only) CPU key-value cache space."""
+    mamba_page_size_padded: Optional[int] = None
+    """ Optional override for mamba page size; used by hybrid mamba/attention
+    models to ensure exact alignment with attention page size."""
 
     # Will be set after profiling.
     num_gpu_blocks: Optional[int] = field(default=None, init=False)
@@ -4882,11 +4885,15 @@ def try_verify_and_update_config(self):
         if architecture is None:
             return
 
-        from vllm.model_executor.models.config import MODELS_CONFIG_MAP
+        from vllm.model_executor.models.config import (
+            MODELS_CONFIG_MAP, HybridAttentionMambaModelConfig)
         cls = MODELS_CONFIG_MAP.get(architecture, None)
         if cls is not None:
             cls.verify_and_update_config(self)
 
+        if self.model_config.is_hybrid:
+            HybridAttentionMambaModelConfig.verify_and_update_config(self)
+
         if self.model_config.task == "classify":
            # Maybe convert ForCausalLM into ForSequenceClassification model.
            from vllm.model_executor.models.adapters import (

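The new `mamba_page_size_padded` field and the hybrid-model hook give `HybridAttentionMambaModelConfig.verify_and_update_config` (defined in one of the commit's files not reproduced above) a place to record a mamba page size that exactly matches the attention page size. A rough sketch of the alignment idea implied by the field's docstring; every name and number below is illustrative, not taken from the commit:

```python
# Illustrative sketch only: choose an attention block size (in tokens) whose page
# is at least as large as one mamba state page, then pad the mamba page up to
# that exact byte count so both cache types can share a single page size.
import math


def align_page_sizes(mamba_page_bytes: int,
                     attn_bytes_per_token: int) -> tuple[int, int]:
    """Return (attention_block_size_tokens, mamba_page_size_padded)."""
    block_size = math.ceil(mamba_page_bytes / attn_bytes_per_token)
    padded = block_size * attn_bytes_per_token
    return block_size, padded


# Example: a mamba state page of 530,000 bytes with 1 KiB of attention KV per
# token -> attention block size of 518 tokens, mamba page padded to 530,432 bytes.
print(align_page_sizes(530_000, 1024))
```
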
vllm/model_executor/layers/mamba/mamba_mixer2.py

Lines changed: 10 additions & 38 deletions
@@ -20,6 +20,8 @@
 from vllm.model_executor.layers.mamba.abstract import MambaBase
 from vllm.model_executor.layers.mamba.mamba2_metadata import (Mamba2Metadata,
                                                               update_metadata)
+from vllm.model_executor.layers.mamba.mamba_utils import (
+    extra_groups_for_head_shards, get_mamba_state_shape)
 from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
     causal_conv1d_fn, causal_conv1d_update)
 from vllm.model_executor.layers.mamba.ops.mamba_ssm import (
@@ -146,18 +148,6 @@ def forward_cuda(
         return out
 
 
-def extra_groups_for_head_shards(ngroups: int, tp_size: int):
-    """Compute the increase in group numbers to account for
-    replication in order to accompany the head shards."""
-
-    # in the case ngoups % tp_size == 0, this will be zero
-    if ngroups % tp_size == 0:
-        return 0
-
-    # for n_groups == 1, this is exactly tp_size - n_groups
-    return tp_size - ngroups
-
-
 def mamba_v2_sharded_weight_loader(
     shard_spec: list[tuple[int, int, float]],
     tp_size: int,
@@ -707,30 +697,12 @@ def forward_cuda(
         return out
 
     def get_state_shape(self) -> tuple[tuple[int, ...], tuple[int, ...]]:
-        world_size = get_tensor_model_parallel_world_size()
-
-        conv_state_shape, temporal_state_shape = None, None
-
-        # if n_groups is not divisible by world_size, need to extend the shards
-        # to ensure all groups needed by a head is sharded along with it
-        n_groups = (self.n_groups +
-                    extra_groups_for_head_shards(self.n_groups, world_size))
-
-        # - heads and n_groups are TP-ed
-        conv_dim = (self.intermediate_size +
-                    2 * n_groups * self.ssm_state_size)
-        # contiguous along 'dim' axis
-        conv_state_shape = (
-            self.conv_kernel_size - 1,
-            divide(conv_dim, world_size),
-        )
-
-        # These are not TP-ed as they depend on A, dt_bias, D
-        # - they are typically small
-        # e.g., (h_heads, d_head, d_state) = (128, 64, 128)
-        temporal_state_shape = (
-            divide(self.num_heads, world_size),
-            self.head_dim,
-            self.ssm_state_size,
+        return get_mamba_state_shape(
+            intermediate_size=self.intermediate_size,
+            tp_world_size=get_tensor_model_parallel_world_size(),
+            n_groups=self.n_groups,
+            num_heads=self.num_heads,
+            head_dim=self.head_dim,
+            state_size=self.ssm_state_size,
+            conv_kernel=self.conv_kernel_size,
         )
-        return conv_state_shape, temporal_state_shape
vllm/model_executor/layers/mamba/mamba_utils.py

Lines changed: 55 additions & 0 deletions

@@ -0,0 +1,55 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from vllm.distributed import divide
+
+
+def extra_groups_for_head_shards(ngroups: int, tp_size: int):
+    """Compute the increase in group numbers to account for
+    replication in order to accompany the head shards."""
+
+    # in the case ngoups % tp_size == 0, this will be zero
+    if ngroups % tp_size == 0:
+        return 0
+
+    # for n_groups == 1, this is exactly tp_size - n_groups
+    return tp_size - ngroups
+
+
+def get_mamba_state_shape(
+    intermediate_size: int,
+    tp_world_size: int,
+    n_groups: int,
+    num_heads: int,
+    head_dim: int,
+    state_size: int,
+    conv_kernel: int,
+    use_v1: bool = True,
+) -> tuple[tuple[int, int], tuple[int, int, int]]:
+    """ Get the shape of mamba state."""
+
+    # if n_groups is not divisible by world_size, need to extend the shards
+    # to ensure all groups needed by a head is sharded along with it
+    n_groups = (n_groups +
+                extra_groups_for_head_shards(n_groups, tp_world_size))
+
+    # - heads and n_groups are TP-ed
+    conv_dim = (intermediate_size + 2 * n_groups * state_size)
+    # contiguous along 'dim' axis
+    conv_state_shape = (
+        conv_kernel - 1,
+        divide(conv_dim, tp_world_size),
+    )
+
+    if not use_v1:
+        conv_state_shape = (conv_state_shape[1], conv_state_shape[0])
+
+    # These are not TP-ed as they depend on A, dt_bias, D
+    # - they are typically small
+    # e.g., (h_heads, head_dim, state_size) = (128, 64, 128)
+    temporal_state_shape = (
+        divide(num_heads, tp_world_size),
+        head_dim,
+        state_size,
+    )
+
+    return conv_state_shape, temporal_state_shape

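For a concrete sense of what the new helper returns, a small usage sketch follows; the parameter values are illustrative (a Bamba-like configuration with tensor parallelism disabled) and are not taken from this commit:

```python
# Usage sketch for the new helper; the numbers below are illustrative.
from vllm.model_executor.layers.mamba.mamba_utils import get_mamba_state_shape

conv_shape, ssm_shape = get_mamba_state_shape(
    intermediate_size=8192,  # e.g. mamba_expand * hidden_size = 2 * 4096
    tp_world_size=1,
    n_groups=1,
    num_heads=128,
    head_dim=64,
    state_size=128,
    conv_kernel=4,
)

# conv_shape == (3, 8448): (conv_kernel - 1, intermediate_size + 2 * n_groups * state_size)
# ssm_shape  == (128, 64, 128): (num_heads, head_dim, state_size)
print(conv_shape, ssm_shape)
```
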
vllm/model_executor/models/bamba.py

Lines changed: 42 additions & 39 deletions
@@ -12,7 +12,7 @@
 from vllm import envs
 from vllm.attention.layer import Attention
 from vllm.config import CacheConfig, VllmConfig
-from vllm.distributed import divide, get_tensor_model_parallel_world_size
+from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.distributed.parallel_state import get_pp_group
 from vllm.forward_context import get_forward_context
 from vllm.model_executor.layers.activation import SiluAndMul
@@ -23,8 +23,8 @@
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.mamba.mamba2_metadata import (
     Mamba2Metadata, prepare_mamba2_metadata)
-from vllm.model_executor.layers.mamba.mamba_mixer2 import (
-    MambaMixer2, extra_groups_for_head_shards)
+from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2
+from vllm.model_executor.layers.mamba.mamba_utils import get_mamba_state_shape
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -435,6 +435,38 @@ class BambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
     }
     embedding_padding_modules = ["lm_head"]
 
+    @classmethod
+    def get_mamba_state_shape_from_config(
+        cls,
+        vllm_config: "VllmConfig",
+        use_v1: bool = True,
+    ) -> tuple[tuple[int, int], tuple[int, int, int]]:
+        """Calculate shapes for Mamba's convolutional and state caches.
+
+        Args:
+            vllm_config: vLLM config
+            use_v1: Get shapes for V1 (or V0)
+
+        Returns:
+            Tuple containing:
+            - conv_state_shape: Shape for convolutional state cache
+            - temporal_state_shape: Shape for state space model cache
+        """
+        parallel_config = vllm_config.parallel_config
+        hf_config = vllm_config.model_config.hf_config
+        intermediate_size = hf_config.mamba_expand * hf_config.hidden_size
+
+        return get_mamba_state_shape(
+            intermediate_size=intermediate_size,
+            tp_world_size=parallel_config.tensor_parallel_size,
+            n_groups=hf_config.mamba_n_groups,
+            num_heads=hf_config.mamba_n_heads,
+            head_dim=hf_config.mamba_d_head,
+            state_size=hf_config.mamba_d_state,
+            conv_kernel=hf_config.mamba_d_conv,
+            use_v1=use_v1,
+        )
+
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config
         self.vllm_config = vllm_config
@@ -491,10 +523,13 @@ def forward(self,
                 self.vllm_config.parallel_config,
                 LayerBlockType.mamba
             )
-
-            self.mamba_cache = MambaCacheManager(
-                self.vllm_config, self.lm_head.weight.dtype,
-                num_mamba_layers, *self._get_mamba_cache_shape())
+            mamba_state_shape = \
+                self.get_mamba_state_shape_from_config(
+                    self.vllm_config, use_v1=False)
+            self.mamba_cache = MambaCacheManager(self.vllm_config,
+                                                 self.lm_head.weight.dtype,
+                                                 num_mamba_layers,
+                                                 *mamba_state_shape)
 
             mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs)
 
@@ -510,38 +545,6 @@ def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs):
     def get_seqlen_agnostic_capture_inputs(self, batch_size: int):
         return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size)
 
-    def _get_mamba_cache_shape(
-            self) -> tuple[tuple[int, int], tuple[int, int]]:
-        world_size = get_tensor_model_parallel_world_size()
-        hidden_size = self.config.hidden_size
-
-        conv_state_shape, temporal_state_shape = None, None
-
-        intermediate_size = self.config.mamba_expand * hidden_size
-
-        # if n_groups is not divisible by world_size, need to extend the shards
-        # to ensure all groups needed by a head is sharded along with it
-        n_groups = (self.config.mamba_n_groups + extra_groups_for_head_shards(
-            self.config.mamba_n_groups, world_size))
-
-        # - heads and n_groups are TP-ed
-        conv_dim = (intermediate_size +
-                    2 * n_groups * self.config.mamba_d_state)
-        conv_state_shape = (
-            divide(conv_dim, world_size),
-            self.config.mamba_d_conv - 1,
-        )
-
-        # These are not TP-ed as they depend on A, dt_bias, D
-        # - they are typically small
-        # e.g., (h_heads, d_head, d_state) = (128, 64, 128)
-        temporal_state_shape = (
-            divide(self.config.mamba_n_heads, world_size),
-            self.config.mamba_d_head,
-            self.config.mamba_d_state,
-        )
-        return conv_state_shape, temporal_state_shape
-
     def compute_logits(
         self,
         hidden_states: torch.Tensor,

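The `use_v1` flag matters here because the V0 path in `forward()` above passes `use_v1=False` when building the `MambaCacheManager`, and the helper then swaps the convolutional-state axes (matching the layout the removed `_get_mamba_cache_shape` used to return). A short sketch of the difference, again with illustrative parameter values:

```python
# Sketch: the same helper serves both engines; only the conv state layout differs.
from vllm.model_executor.layers.mamba.mamba_utils import get_mamba_state_shape

params = dict(intermediate_size=8192, tp_world_size=1, n_groups=1,
              num_heads=128, head_dim=64, state_size=128, conv_kernel=4)

v1_conv, v1_ssm = get_mamba_state_shape(use_v1=True, **params)
v0_conv, v0_ssm = get_mamba_state_shape(use_v1=False, **params)

assert v1_conv == (3, 8448) and v0_conv == (8448, 3)  # conv axes swapped for V0
assert v1_ssm == v0_ssm == (128, 64, 128)             # SSM state shape unchanged
```
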