
Commit bec5628

Support for attention free models in V1
1 parent 9a3b883 commit bec5628

File tree (3 files changed: +91 −31 lines changed)
    vllm/v1/core/kv_cache_manager.py
    vllm/v1/core/sched/scheduler.py
    vllm/v1/engine/core.py

vllm/v1/core/kv_cache_manager.py

Lines changed: 47 additions & 0 deletions
@@ -63,6 +63,53 @@ def new_empty(self) -> "KVCacheBlocks":
         """Creates a new KVCacheBlocks instance with no blocks."""
         return KVCacheBlocks(tuple([] for _ in range(len(self.blocks))))
 
+class DummyKVCacheManager:
+
+    @property
+    def usage(self) -> float:
+        return 0.0
+
+    def make_prefix_cache_stats(self) -> Optional[PrefixCacheStats]:
+        return None
+
+    def get_computed_blocks(self,
+                            request: Request) -> tuple[KVCacheBlocks, int]:
+        return (KVCacheBlocks([]), 0)
+
+    def allocate_slots(
+        self,
+        request: Request,
+        num_new_tokens: int,
+        num_new_computed_tokens: int = 0,
+        new_computed_blocks: Optional[KVCacheBlocks] = None,
+        num_draft_tokens: int = 0,
+        num_lookahead_tokens: int = 0,
+        delay_cache_blocks: bool = False,
+    ) -> Optional[KVCacheBlocks]:
+        # If we do not return a KV cache block, requests are unschedulable.
+        return KVCacheBlocks([KVCacheBlock(block_id=0)])
+
+    def free(self, request: Request) -> None:
+        pass
+
+    def reset_prefix_cache(self) -> bool:
+        return True
+
+    def get_num_common_prefix_blocks(
+        self,
+        request: Request,
+        num_running_requests: int,
+    ) -> list[int]:
+        return []
+
+    def free_block_hashes(self, request: Request) -> None:
+        pass
+
+    def take_events(self) -> list[KVCacheEvent]:
+        return []
+
+    def get_block_ids(self, request_id: str) -> list[list[int]]:
+        """Get the block ids of a request."""
+        return []
 
 
 class KVCacheManager:

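The new DummyKVCacheManager mirrors the duck-typed surface of KVCacheManager while never tracking or allocating real blocks, which lets attention-free models flow through the regular scheduling path. Below is a minimal usage sketch, not part of the commit: it assumes this patch is applied, and _FakeRequest is a hypothetical stand-in for vllm.v1.request.Request (the dummy manager never inspects its arguments).

from vllm.v1.core.kv_cache_manager import DummyKVCacheManager


class _FakeRequest:
    # Hypothetical placeholder request; any object works here because the
    # dummy manager ignores its arguments.
    request_id = "req-0"


req = _FakeRequest()
manager = DummyKVCacheManager()

print(manager.usage)  # 0.0 -- no KV cache memory is ever used
blocks, num_computed = manager.get_computed_blocks(req)  # empty blocks, 0 tokens
new_blocks = manager.allocate_slots(req, num_new_tokens=16)
assert new_blocks is not None  # one placeholder block keeps the request schedulable
manager.free(req)  # no-op
manager.free_block_hashes(req)  # no-op
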
vllm/v1/core/sched/scheduler.py

Lines changed: 15 additions & 11 deletions
@@ -18,7 +18,7 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
 from vllm.v1.core.encoder_cache_manager import (EncoderCacheManager,
                                                 compute_encoder_budget)
-from vllm.v1.core.kv_cache_manager import KVCacheManager
+from vllm.v1.core.kv_cache_manager import KVCacheBlocks, KVCacheManager, DummyKVCacheManager
 from vllm.v1.core.sched.interface import SchedulerInterface
 from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData,
                                        SchedulerOutput)
@@ -90,7 +90,8 @@ def __init__(
         )
 
         num_gpu_blocks = self.cache_config.num_gpu_blocks
-        assert num_gpu_blocks is not None and num_gpu_blocks > 0
+        # num_gpu_blocks can be zero for attention-free models.
+        assert num_gpu_blocks is not None
 
         self.block_size = self.cache_config.block_size
 
@@ -155,15 +156,18 @@ def __init__(
         self.num_lookahead_tokens = self.num_spec_tokens
 
         # Create the KV cache manager.
-        self.kv_cache_manager = KVCacheManager(
-            kv_cache_config=kv_cache_config,
-            max_model_len=self.max_model_len,
-            enable_caching=self.cache_config.enable_prefix_caching,
-            caching_hash_algo=self.cache_config.prefix_caching_hash_algo,
-            use_eagle=self.use_eagle,
-            log_stats=self.log_stats,
-            enable_kv_cache_events=self.enable_kv_cache_events,
-        )
+        if self.cache_config.is_attention_free:
+            self.kv_cache_manager = DummyKVCacheManager()
+        else:
+            self.kv_cache_manager = KVCacheManager(
+                kv_cache_config=kv_cache_config,
+                max_model_len=self.max_model_len,
+                enable_caching=self.cache_config.enable_prefix_caching,
+                caching_hash_algo=self.cache_config.prefix_caching_hash_algo,
+                use_eagle=self.use_eagle,
+                log_stats=self.log_stats,
+                enable_kv_cache_events=self.enable_kv_cache_events,
+            )
 
     def schedule(self) -> SchedulerOutput:
         # NOTE(woosuk) on the scheduling algorithm:

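The scheduler change reduces to a single branch on cache_config.is_attention_free, paired with the relaxed assertion above: attention-free models report num_gpu_blocks == 0, which the old num_gpu_blocks > 0 check would have rejected at startup. The sketch below isolates that decision in a hypothetical make_kv_cache_manager() helper (not part of the diff); the keyword arguments are a subset of the ones shown in the constructor call above.

from vllm.v1.core.kv_cache_manager import DummyKVCacheManager, KVCacheManager


def make_kv_cache_manager(cache_config, kv_cache_config, max_model_len):
    # Attention-free models (e.g. pure SSM/Mamba stacks) never read or write
    # a KV cache, so a no-op manager is enough to keep requests schedulable.
    if cache_config.is_attention_free:
        return DummyKVCacheManager()
    return KVCacheManager(
        kv_cache_config=kv_cache_config,
        max_model_len=max_model_len,
        enable_caching=cache_config.enable_prefix_caching,
        caching_hash_algo=cache_config.prefix_caching_hash_algo,
    )
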
vllm/v1/engine/core.py

Lines changed: 29 additions & 20 deletions
@@ -134,26 +134,34 @@ def _initialize_kv_caches(
             self, vllm_config: VllmConfig) -> tuple[int, int, KVCacheConfig]:
         start = time.time()
 
-        # Get all kv cache needed by the model
-        kv_cache_specs = self.model_executor.get_kv_cache_specs()
-
-        # Profiles the peak memory usage of the model to determine how much
-        # memory can be allocated for kv cache.
-        available_gpu_memory = self.model_executor.determine_available_memory()
-
-        assert len(kv_cache_specs) == len(available_gpu_memory)
-        # Get the kv cache tensor size
-        kv_cache_configs = [
-            get_kv_cache_config(vllm_config, kv_cache_spec_one_worker,
-                                available_gpu_memory_one_worker)
-            for kv_cache_spec_one_worker, available_gpu_memory_one_worker in
-            zip(kv_cache_specs, available_gpu_memory)
-        ]
-
-        # Since we use a shared centralized controller, we need the
-        # `kv_cache_config` to be consistent across all workers to make sure
-        # all the memory operators can be applied to all workers.
-        unify_kv_cache_configs(kv_cache_configs)
+        if vllm_config.model_config.is_attention_free:
+            # No need to initialize anything related to the KV cache if the
+            # model is attention free.
+            kv_cache_specs = []
+            kv_cache_configs = [
+                KVCacheConfig(num_blocks=0, tensors={}, kv_cache_groups=[])
+            ]
+        else:
+            # Get all kv cache needed by the model
+            kv_cache_specs = self.model_executor.get_kv_cache_specs()
+
+            # Profiles the peak memory usage of the model to determine how much
+            # memory can be allocated for kv cache.
+            available_gpu_memory = self.model_executor.determine_available_memory()
+
+            assert len(kv_cache_specs) == len(available_gpu_memory)
+            # Get the kv cache tensor size
+            kv_cache_configs = [
+                get_kv_cache_config(vllm_config, kv_cache_spec_one_worker,
+                                    available_gpu_memory_one_worker)
+                for kv_cache_spec_one_worker, available_gpu_memory_one_worker in
+                zip(kv_cache_specs, available_gpu_memory)
+            ]
+
+            # Since we use a shared centralized controller, we need the
+            # `kv_cache_config` to be consistent across all workers to make sure
+            # all the memory operators can be applied to all workers.
+            unify_kv_cache_configs(kv_cache_configs)
 
         # All workers have the same kv_cache_config except layer names, so use
         # an arbitrary one to initialize the scheduler.
@@ -186,6 +194,7 @@ def add_request(self, request: EngineCoreRequest):
         request.mm_inputs = self.mm_input_cache_server.get_and_update_p1(
             request.mm_inputs, request.mm_hashes)
 
+
         req = Request.from_engine_core_request(request)
         if req.use_structured_output:
             # Start grammar compilation asynchronously

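For attention-free models the engine now skips KV cache profiling altogether and hands the scheduler a zero-block configuration. A small sketch of what gets built, with the field values taken from the diff above; the import path for KVCacheConfig is assumed to be vllm.v1.kv_cache_interface.

from vllm.v1.kv_cache_interface import KVCacheConfig  # assumed import path

# What _initialize_kv_caches builds instead of profiling GPU memory.
kv_cache_configs = [KVCacheConfig(num_blocks=0, tensors={}, kv_cache_groups=[])]

scheduler_config = kv_cache_configs[0]  # single entry, nothing to unify across workers
print(scheduler_config.num_blocks)  # 0 -> the scheduler sees num_gpu_blocks == 0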