Commit bcd9376

Revert other changs; update docs
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
1 parent 2ff9a09 commit bcd9376

File tree: 5 files changed (+72, -202 lines)

docs/usage/v1_guide.md

Lines changed: 2 additions & 1 deletion
```diff
@@ -112,7 +112,8 @@ enforcing eager mode and disabling prefix caching in V1.
 Models that combine Mamba-2 layers with standard attention layers are also supported (e.g., `BambaForCausalLM`,
 `Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`). Please note that
 these models currently require enforcing eager mode, disabling prefix caching, and using the FlashInfer attention
-backend in V1.
+backend in V1. It is also necessary to pass a non-standard block size for attention layers (this is not possible
+using the `vllm serve` CLI yet).
 
 #### Encoder-Decoder Models
 
```
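For convenience, a minimal offline-inference sketch of the configuration described in this doc change follows. The block size of 528 for `ibm-ai-platform/Bamba-9B-v1` is taken from the test table in this commit; the environment variables and sampling settings are illustrative assumptions, not an official recipe.

```python
# Hedged sketch: run a hybrid Mamba-2/attention model on V1 with an enlarged
# attention block size. Values other than the model name and block size are
# illustrative assumptions.
import os

os.environ["VLLM_USE_V1"] = "1"                       # V1 engine, as in the test
os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"   # assumed backend selector

from vllm import LLM, SamplingParams

llm = LLM(
    model="ibm-ai-platform/Bamba-9B-v1",
    enforce_eager=True,           # required for these models in V1 (per docs)
    enable_prefix_caching=False,  # prefix caching not yet supported
    block_size=528,               # non-standard attention block size
)

outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)
```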
tests/models/language/generation/test_hybrid.py

Lines changed: 15 additions & 1 deletion
```diff
@@ -61,6 +61,14 @@
     "tiiuae/Falcon-H1-0.5B-Base",
 ]
 
+ATTN_BLOCK_SIZES = {
+    "ibm-ai-platform/Bamba-9B-v1": 528,
+    "Zyphra/Zamba2-1.2B-instruct": 80,
+    "nvidia/Nemotron-H-8B-Base-8K": 528,
+    "ibm-granite/granite-4.0-tiny-preview": 400,
+    "tiiuae/Falcon-H1-0.5B-Base": 800,
+}
+
 # Avoid OOM
 MAX_NUM_SEQS = 4
 
@@ -97,6 +105,11 @@ def test_models(
             example_prompts, max_tokens, num_logprobs)
 
     if model in V1_SUPPORTED_MODELS:
+        if model in HYBRID_MODELS and model in ATTN_BLOCK_SIZES:
+            block_size = ATTN_BLOCK_SIZES[model]
+        else:
+            block_size = 16
+
         with monkeypatch.context() as m:
             m.setenv("VLLM_USE_V1", "1")
             if model in HYBRID_MODELS:
@@ -105,7 +118,8 @@ def test_models(
             with vllm_runner(model,
                              max_num_seqs=MAX_NUM_SEQS,
                              enforce_eager=True,
-                             enable_prefix_caching=False) as vllm_model:
+                             enable_prefix_caching=False,
+                             block_size=block_size) as vllm_model:
                 vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
                     example_prompts, max_tokens, num_logprobs)
     else:
```

vllm/config.py

Lines changed: 0 additions & 3 deletions
```diff
@@ -1553,9 +1553,6 @@ class CacheConfig:
     checkpoint if available. Otherwise, the scales will default to 1.0."""
     cpu_kvcache_space_bytes: Optional[int] = None
     """(CPU backend only) CPU key-value cache space."""
-    mamba_page_size_padded: Optional[int] = None
-    """ Optional override for mamba page size; used by hybrid mamaba/attention
-    models to ensure exact alignment with attention page size."""
 
     # Will be set after profiling.
     num_gpu_blocks: Optional[int] = field(default=None, init=False)
```

vllm/model_executor/models/config.py

Lines changed: 0 additions & 194 deletions
```diff
@@ -1,18 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from copy import deepcopy
-from dataclasses import dataclass
 from typing import TYPE_CHECKING
 
-import vllm.envs as envs
-from vllm.distributed import divide
 from vllm.logger import init_logger
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv
-from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec
 
 if TYPE_CHECKING:
-    from transformers.configuration_utils import PretrainedConfig
-
     from vllm.config import VllmConfig
 
 logger = init_logger(__name__)
@@ -198,197 +191,10 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None:
         }
 
 
-class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
-
-    @classmethod
-    def extra_groups_for_head_shards(cls, ngroups: int, tp_size: int) -> int:
-        """Compute the increase in group numbers to account for
-        replication in order to accompany the head shards."""
-
-        # in the case ngoups % tp_size == 0, this will be zero
-        if ngroups % tp_size == 0:
-            return 0
-
-        # for n_groups == 1, this is exactly tp_size - n_groups
-        return tp_size - ngroups
-
-    @dataclass
-    class MambaConfig:
-        expand: int
-        n_groups: int
-        n_heads: int
-        d_head: int
-        d_state: int
-        d_conv: int
-
-    @classmethod
-    def parse_mamba_config(cls, config: "PretrainedConfig") -> MambaConfig:
-        return cls.MambaConfig(
-            expand=config.mamba_expand,
-            n_groups=config.mamba_n_groups,
-            n_heads=config.mamba_n_heads,
-            d_head=config.mamba_d_head,
-            d_state=config.mamba_d_state,
-            d_conv=config.mamba_d_conv,
-        )
-
-    @classmethod
-    def get_mamba_cache_shape(
-            cls, vllm_config: "VllmConfig"
-    ) -> tuple[tuple[int, int], tuple[int, int]]:
-
-        parallel_config = vllm_config.parallel_config
-        hf_config = vllm_config.model_config.hf_config
-        mamba_config = cls.parse_mamba_config(hf_config)
-
-        world_size = parallel_config.tensor_parallel_size
-        hidden_size = hf_config.hidden_size
-        intermediate_size = mamba_config.expand * hidden_size
-
-        # if n_groups is not divisible by world_size, need to extend the shards
-        # to ensure all groups needed by a head is sharded along with it
-        n_groups = (mamba_config.n_groups + cls.extra_groups_for_head_shards(
-            mamba_config.n_groups, world_size))
-
-        # - heads and n_groups are TP-ed
-        conv_dim = (intermediate_size + 2 * n_groups * mamba_config.d_state)
-        conv_state_shape = (
-            divide(conv_dim, world_size),
-            mamba_config.d_conv - 1,
-        )
-
-        # These are not TP-ed as they depend on A, dt_bias, D
-        # - they are typically small
-        # e.g., (h_heads, d_head, d_state) = (128, 64, 128)
-        temporal_state_shape = (
-            divide(mamba_config.n_heads, world_size),
-            mamba_config.d_head,
-            mamba_config.d_state,
-        )
-
-        return conv_state_shape, temporal_state_shape
-
-    @classmethod
-    def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
-        """
-        Ensure that page size of attention layers is greater than or
-        equal to the mamba layers. If not, automatically set the attention
-        block size to ensure that it is. If the attention page size is
-        strictly greater than the mamba page size, we pad the mamba page size
-        to make them equal.
-
-        Args:
-            vllm_config: vLLM Config
-        """
-
-        if not envs.VLLM_USE_V1:
-            return
-
-        cache_config = vllm_config.cache_config
-        model_config = vllm_config.model_config
-        parallel_config = vllm_config.parallel_config
-
-        if cache_config.cache_dtype == "auto":
-            kv_cache_dtype = model_config.dtype
-        else:
-            kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype]
-
-        # get attention page size (for 1 token)
-        attn_page_size_1_token = FullAttentionSpec(
-            block_size=1,
-            num_kv_heads=model_config.get_num_kv_heads(parallel_config),
-            head_size=model_config.get_head_size(),
-            dtype=kv_cache_dtype,
-            use_mla=model_config.use_mla).page_size_bytes
-
-        # get mamba page size
-        mamba_page_size = MambaSpec(
-            shapes=cls.get_mamba_cache_shape(vllm_config),
-            dtype=kv_cache_dtype,
-            block_size=model_config.max_model_len,
-        ).page_size_bytes
-
-        # some attention backends (e.g. FA) only support setting
-        # block size to multiple of 16, so let's suggest a value
-        # that would work (note: FA is currently not compatible
-        # with mamba layers, use FlashInfer instead).
-        attn_block_size = 16 * cdiv(mamba_page_size,
-                                    16 * attn_page_size_1_token)
-
-        # override attention block size if either (a) the
-        # user has not set it or (b) the user has set it
-        # too small.
-        if (cache_config.block_size is None
-                or cache_config.block_size < attn_block_size):
-            cache_config.block_size = attn_block_size
-            logger.info(
-                "Setting attention block size to %d tokens "
-                "to ensure that attention page size is >= mamba page size.",
-                attn_block_size)
-
-        # compute new attention page size
-        attn_page_size = \
-            cache_config.block_size * attn_page_size_1_token
-
-        assert attn_page_size >= mamba_page_size
-
-        if attn_page_size == mamba_page_size:
-            # don't need to pad mamba page size
-            return
-
-        # pad mamba page size to exactly match attention
-        if (cache_config.mamba_page_size_padded is None
-                or cache_config.mamba_page_size_padded != attn_page_size):
-            cache_config.mamba_page_size_padded = (attn_page_size)
-            mamba_padding_pct = 100 * (attn_page_size -
-                                       mamba_page_size) / mamba_page_size
-            logger.info(
-                "Padding mamba page size by %.2f%% to ensure "
-                "that mamba page size and attention page size are "
-                "exactly equal.", mamba_padding_pct)
-
-
-class NemotronHModelConfig(HybridAttentionMambaModelConfig):
-
-    @classmethod
-    def parse_mamba_config(
-            cls, config: "PretrainedConfig"
-    ) -> HybridAttentionMambaModelConfig.MambaConfig:
-        return HybridAttentionMambaModelConfig.MambaConfig(
-            expand=config.expand,
-            n_groups=config.n_groups,
-            n_heads=config.mamba_num_heads,
-            d_head=config.mamba_head_dim,
-            d_state=config.ssm_state_size,
-            d_conv=config.conv_kernel,
-        )
-
-
-class Zamba2ModelConfig(HybridAttentionMambaModelConfig):
-
-    @classmethod
-    def parse_mamba_config(
-            cls, config: "PretrainedConfig"
-    ) -> HybridAttentionMambaModelConfig.MambaConfig:
-        return HybridAttentionMambaModelConfig.MambaConfig(
-            expand=config.mamba_expand,
-            n_groups=config.mamba_ngroups,
-            n_heads=config.n_mamba_heads,
-            d_head=config.mamba_headdim,
-            d_state=config.mamba_d_state,
-            d_conv=config.mamba_d_conv,
-        )
-
-
 MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = {
     "GteModel": SnowflakeGteNewModelConfig,
     "GteNewModel": GteNewModelConfig,
     "NomicBertModel": NomicBertModelConfig,
     "Qwen3ForSequenceClassification": Qwen3ForSequenceClassificationConfig,
     "XLMRobertaModel": JinaRobertaModelConfig,
-    "FalconH1ForCausalLM": HybridAttentionMambaModelConfig,
-    "BambaForCausalLM": HybridAttentionMambaModelConfig,
-    "GraniteMoeHybridForCausalLM": HybridAttentionMambaModelConfig,
-    "NemotronHForCausalLM": NemotronHModelConfig,
-    "Zamba2ForCausalLM": Zamba2ModelConfig,
 }
```
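As a reading aid for the `get_mamba_cache_shape` logic removed above, the following standalone sketch reproduces the same state-shape arithmetic with made-up hyperparameters (illustrative assumptions, not values from any particular checkpoint):

```python
# Standalone sketch of the deleted get_mamba_cache_shape arithmetic.
# All hyperparameters below are illustrative assumptions.
def divide(numerator: int, denominator: int) -> int:
    assert numerator % denominator == 0
    return numerator // denominator

tp_size = 2                      # tensor parallel world size
expand, hidden_size = 2, 4096
n_groups, n_heads = 1, 128
d_head, d_state, d_conv = 64, 128, 4

intermediate_size = expand * hidden_size                            # 8192
# replicate groups so each head shard keeps the groups it needs
extra = 0 if n_groups % tp_size == 0 else tp_size - n_groups        # 1
conv_dim = intermediate_size + 2 * (n_groups + extra) * d_state     # 8704
conv_state_shape = (divide(conv_dim, tp_size), d_conv - 1)          # (4352, 3)
temporal_state_shape = (divide(n_heads, tp_size), d_head, d_state)  # (64, 64, 128)
print(conv_state_shape, temporal_state_shape)
```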

vllm/v1/worker/gpu_model_runner.py

Lines changed: 55 additions & 3 deletions
```diff
@@ -43,7 +43,7 @@
 from vllm.sampling_params import SamplingType
 from vllm.sequence import IntermediateTensors
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
-                        GiB_bytes, LazyLoader, async_tensor_h2d,
+                        GiB_bytes, LazyLoader, async_tensor_h2d, cdiv,
                         check_use_alibi, get_dtype_size,
                         is_pin_memory_available, round_up)
 from vllm.v1.attention.backends.mamba_attn import Mamba2AttentionBackend
@@ -2675,8 +2675,9 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
                     "Prefix caching is not supported for Mamba yet.")
             max_model_len = self.vllm_config.model_config.max_model_len
 
-            page_size_padded = (
-                self.vllm_config.cache_config.mamba_page_size_padded)
+            page_size_padded = self._maybe_pad_mamba_page_size(
+                attn_layers, mamba_layers, kv_cache_spec, max_model_len,
+                block_size)
 
             # Set block_size to max_model_len, so that mamba model will always
             # have only one block in the KV cache.
@@ -2688,3 +2689,54 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
                     page_size_padded=page_size_padded)
 
         return kv_cache_spec
+
+    def _maybe_pad_mamba_page_size(
+        self,
+        attn_layers: dict[str, Attention],
+        mamba_layers: dict[str, MambaMixer2],
+        kv_cache_spec: dict[str, KVCacheSpec],
+        max_model_len: int,
+        block_size: int,
+    ) -> Optional[int]:
+        """
+        Ensure that page size of attention KV cache groups is greater than or
+        equal to the mamba KV cache groups. If not, we suggest to the user
+        how to set the attention block size to ensure that it is.
+
+        If the attention page size is strictly greater than the mamba page size,
+        we pad the mamba page size to make them equal.
+
+        Args:
+            attn_layers: Attention layers
+            mamba_layers: Mamba layers
+            kv_cache_spec: KV cache spec (populated with attention layers)
+
+        Returns:
+            Optional[int]: Mamba page size with padding (None if no padding).
+        """
+
+        if len(attn_layers) == 0:
+            return None
+
+        attn_layer_name = next(iter(attn_layers))
+        attn_page_size = kv_cache_spec[attn_layer_name].page_size_bytes
+        mamba_layer_name = next(iter(mamba_layers))
+        mamba_page_size = MambaSpec(
+            shapes=mamba_layers[mamba_layer_name].get_state_shape(),
+            dtype=self.kv_cache_dtype,
+            block_size=max_model_len).page_size_bytes
+        if attn_page_size < mamba_page_size:
+            # attention page size (for 16 tokens)
+            attn_page_size_16 = 16 * attn_page_size // block_size
+            # some attention backends (e.g. FA) only support setting
+            # block size to multiple of 16, so let's suggest a value
+            # that would work (note: FA is currently not compatible
+            # with mamba layers, use FlashInfer instead).
+            suggest_attn_block_size = 16 * cdiv(mamba_page_size,
+                                                attn_page_size_16)
+            raise ValueError(
+                "Attention block size should be increased to at least "
+                f"{suggest_attn_block_size} in order to match "
+                "the mamba page size")
+
+        return attn_page_size
```
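To make the block-size suggestion above concrete, here is a small worked example of the `16 * cdiv(...)` rounding used in the error message; the byte counts are invented for illustration and are not measurements from a real model.

```python
# Worked example of the block-size suggestion logic above.
# The byte counts are illustrative assumptions, not real measurements.
def cdiv(a: int, b: int) -> int:
    """Ceiling division, as in vllm.utils.cdiv."""
    return -(-a // b)

mamba_page_size = 270_000        # assumed bytes of mamba state per sequence
attn_page_size_1_token = 512     # assumed bytes of attention KV cache per token

# Round the required block size up to a multiple of 16, since some attention
# backends only accept block sizes that are multiples of 16.
suggest_attn_block_size = 16 * cdiv(mamba_page_size, 16 * attn_page_size_1_token)
print(suggest_attn_block_size)   # 528 -> pass this as block_size to the engine
```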
