
Commit e0dd56a

Support for attention free models revisited to reuse the existing KVCache manager.
Signed-off-by: Christian Pinto <christian.pinto@ibm.com>
1 parent 6f4944a commit e0dd56a

6 files changed: 64 additions & 40 deletions

vllm/v1/core/block_pool.py

Lines changed: 5 additions & 2 deletions
@@ -36,7 +36,8 @@ def __init__(
         enable_caching: bool,
         enable_kv_cache_events: bool = False,
     ):
-        assert isinstance(num_gpu_blocks, int) and num_gpu_blocks > 0
+        # num_gpu_blocks can be 0 for attention free models
+        assert isinstance(num_gpu_blocks, int)
         self.num_gpu_blocks = num_gpu_blocks
         self.enable_caching = enable_caching
         # All kv-cache blocks.
@@ -60,10 +61,12 @@ def __init__(
         self.cached_block_hash_to_block: dict[BlockHashWithGroupId, dict[
             int, KVCacheBlock]] = defaultdict(dict)

+        if not self.free_block_queue:
+            self
         # To represent a placeholder block with block_id=0.
         # The ref_cnt of null_block is not maintained, needs special care to
         # avoid freeing it.
-        self.null_block = self.free_block_queue.popleft()
+        self.null_block = self.free_block_queue.popleft() if self.free_block_queue else None
         self.null_block.is_null = True

         self.enable_kv_cache_events = enable_kv_cache_events
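
Taken together, the two hunks let a BlockPool be built with num_gpu_blocks == 0 and leave null_block as None when there is nothing to pop. A minimal standalone sketch of that guard logic (Block and ToyBlockPool are toy stand-ins, not vLLM's real classes, with a deque standing in for the free-block queue):

from collections import deque
from dataclasses import dataclass
from typing import Optional

@dataclass
class Block:
    block_id: int
    is_null: bool = False

class ToyBlockPool:
    def __init__(self, num_gpu_blocks: int) -> None:
        # Relaxed invariant: zero blocks is legal for attention free models.
        assert isinstance(num_gpu_blocks, int)
        self.free_blocks = deque(Block(i) for i in range(num_gpu_blocks))
        # Only pop a null block when the free list is non-empty.
        self.null_block: Optional[Block] = (
            self.free_blocks.popleft() if self.free_blocks else None)
        if self.null_block is not None:
            self.null_block.is_null = True

print(ToyBlockPool(num_gpu_blocks=0).null_block)  # None
print(ToyBlockPool(num_gpu_blocks=4).null_block)  # Block(block_id=0, is_null=True)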

vllm/v1/core/kv_cache_coordinator.py

Lines changed: 3 additions & 1 deletion
@@ -219,7 +219,9 @@ def __init__(self, kv_cache_config: KVCacheConfig, max_model_len: int,
         super().__init__(kv_cache_config, max_model_len, use_eagle,
                          enable_caching, caching_hash_fn,
                          enable_kv_cache_events)
-        self.verify_and_split_kv_cache_groups()
+        # attention free models are initialized with 0 kv_cache_groups
+        if len(self.kv_cache_config.kv_cache_groups) > 0:
+            self.verify_and_split_kv_cache_groups()

     def verify_and_split_kv_cache_groups(self) -> None:
         """

vllm/v1/core/kv_cache_manager.py

Lines changed: 11 additions & 6 deletions
@@ -84,12 +84,17 @@ def __init__(
         self.log_stats = log_stats
         # FIXME: make prefix cache stats conditional on log_stats
         self.prefix_cache_stats = PrefixCacheStats() if log_stats else None
-        assert len(
-            set(g.kv_cache_spec.block_size
-                for g in kv_cache_config.kv_cache_groups)
-        ) == 1, "Only one block size is supported for now"
-        self.block_size = kv_cache_config.kv_cache_groups[
-            0].kv_cache_spec.block_size
+
+        if len(kv_cache_config.kv_cache_groups) == 0:
+            # This is an attention free model that is started with 0 KVCache groups.
+            self.block_size = 0
+        else:
+            assert len(
+                set(g.kv_cache_spec.block_size
+                    for g in kv_cache_config.kv_cache_groups)
+            ) == 1, "Only one block size is supported for now"
+            self.block_size = kv_cache_config.kv_cache_groups[
+                0].kv_cache_spec.block_size

         self.coordinator = get_kv_cache_coordinator(
             kv_cache_config=kv_cache_config,
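
The block-size selection above distills to a small pure function. A sketch with toy types (ToyGroup and ToySpec are stand-ins for vLLM's KV cache group and spec classes):

from dataclasses import dataclass

@dataclass
class ToySpec:
    block_size: int

@dataclass
class ToyGroup:
    kv_cache_spec: ToySpec

def pick_block_size(kv_cache_groups: list[ToyGroup]) -> int:
    if len(kv_cache_groups) == 0:
        # Attention free model: no KV cache groups, so no meaningful block size.
        return 0
    sizes = {g.kv_cache_spec.block_size for g in kv_cache_groups}
    assert len(sizes) == 1, "Only one block size is supported for now"
    return kv_cache_groups[0].kv_cache_spec.block_size

print(pick_block_size([]))                                              # 0
print(pick_block_size([ToyGroup(ToySpec(16)), ToyGroup(ToySpec(16))]))  # 16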

vllm/v1/core/kv_cache_utils.py

Lines changed: 12 additions & 8 deletions
@@ -200,14 +200,18 @@ class FreeKVCacheBlockQueue:
     def __init__(self, blocks: list[KVCacheBlock]) -> None:
         self.num_free_blocks = len(blocks)

-        # Initialize the doubly linked list of free blocks.
-        self.free_list_head: Optional[KVCacheBlock] = blocks[0]
-        self.free_list_tail: Optional[KVCacheBlock] = blocks[-1]
-        for i in range(self.num_free_blocks):
-            if i > 0:
-                blocks[i].prev_free_block = blocks[i - 1]
-            if i < self.num_free_blocks - 1:
-                blocks[i].next_free_block = blocks[i + 1]
+        # This is 0 in attention free models
+        if self.num_free_blocks > 0:
+            # Initialize the doubly linked list of free blocks.
+            self.free_list_head: Optional[KVCacheBlock] = blocks[0]
+            self.free_list_tail: Optional[KVCacheBlock] = blocks[-1]
+            for i in range(self.num_free_blocks):
+                if i > 0:
+                    blocks[i].prev_free_block = blocks[i - 1]
+                if i < self.num_free_blocks - 1:
+                    blocks[i].next_free_block = blocks[i + 1]
+        else:
+            self.free_list_head = self.free_list_tail = KVCacheBlock(block_id=0, ref_cnt=0, is_null=True)

     def popleft(self) -> KVCacheBlock:
         """Pop the first free block and reduce num_free_blocks by 1.

vllm/v1/core/sched/scheduler.py

Lines changed: 30 additions & 22 deletions
@@ -92,7 +92,9 @@ def __init__(
         )

         num_gpu_blocks = self.cache_config.num_gpu_blocks
-        assert num_gpu_blocks is not None and num_gpu_blocks > 0
+
+        # num_gpu_blocks can be zero for attention free models
+        assert num_gpu_blocks is not None

         self.block_size = self.cache_config.block_size

@@ -246,6 +248,12 @@ def schedule(self) -> SchedulerOutput:
                 request.num_tokens, 0)

             while True:
+                # This model is attention free and we do not need to allocate KVCache blocks
+                # for serving requests.
+                if self.vllm_config.model_config.is_attention_free:
+                    can_schedule = True
+                    break
+
                 new_blocks = self.kv_cache_manager.allocate_slots(
                     request,
                     num_new_tokens,
@@ -438,29 +446,29 @@ def schedule(self) -> SchedulerOutput:
                 if num_new_tokens == 0:
                     # The request cannot be scheduled.
                     break
-
-                new_blocks = self.kv_cache_manager.allocate_slots(
-                    request,
-                    num_new_tokens + num_external_computed_tokens,
-                    num_new_local_computed_tokens,
-                    new_computed_blocks,
-                    num_lookahead_tokens=self.num_lookahead_tokens,
-                    delay_cache_blocks=load_kv_async,
-                )
-                if new_blocks is None:
-                    # The request cannot be scheduled.
-                    break
-
-                # KVTransfer: the connector uses this info to determine
-                # if a load is needed. Note that
-                # This information is used to determine if a load is
-                # needed for this request.
-                if self.connector is not None:
-                    self.connector.update_state_after_alloc(
+                if not self.vllm_config.model_config.is_attention_free:
+                    new_blocks = self.kv_cache_manager.allocate_slots(
                         request,
-                        new_computed_blocks + new_blocks,
-                        num_external_computed_tokens,
+                        num_new_tokens + num_external_computed_tokens,
+                        num_new_local_computed_tokens,
+                        new_computed_blocks,
+                        num_lookahead_tokens=self.num_lookahead_tokens,
+                        delay_cache_blocks=load_kv_async,
                     )
+                    if new_blocks is None:
+                        # The request cannot be scheduled.
+                        break
+
+                    # KVTransfer: the connector uses this info to determine
+                    # if a load is needed. Note that
+                    # This information is used to determine if a load is
+                    # needed for this request.
+                    if self.connector is not None:
+                        self.connector.update_state_after_alloc(
+                            request,
+                            new_computed_blocks + new_blocks,
+                            num_external_computed_tokens,
+                        )

                 # Request was already popped from self.waiting
                 # unless it was re-added above due to new_blocks being None.
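
Boiled down, the scheduler now treats attention free requests as always schedulable and consults the KV cache manager only otherwise. A toy distillation of that control flow (try_schedule and the allocate callback are hypothetical helpers, not vLLM APIs):

from typing import Callable, Optional

def try_schedule(is_attention_free: bool,
                 allocate: Callable[[], Optional[list[int]]]) -> bool:
    # Attention free models need no KV cache blocks, so scheduling can
    # never fail for lack of them.
    if is_attention_free:
        return True
    # Otherwise the request is schedulable only if block allocation succeeds.
    return allocate() is not None

print(try_schedule(True, allocate=lambda: None))     # True: no blocks needed
print(try_schedule(False, allocate=lambda: None))    # False: allocation failed
print(try_schedule(False, allocate=lambda: [3, 7]))  # True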

vllm/v1/worker/gpu_input_batch.py

Lines changed: 3 additions & 1 deletion
@@ -295,7 +295,9 @@ def add_request(
         self.num_tokens_no_spec[req_index] = request.num_tokens

         self.num_computed_tokens_cpu[req_index] = request.num_computed_tokens
-        self.block_table.add_row(request.block_ids, req_index)
+        if request.block_ids:
+            # Attention free requests have no blocks assigned
+            self.block_table.add_row(request.block_ids, req_index)

         if sampling_params := request.sampling_params:
             if (self.is_spec_decode
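
And on the worker side, the same empty-check pattern in isolation (ToyBlockTable is a stand-in for vLLM's block table, add_request a hypothetical wrapper):

class ToyBlockTable:
    def __init__(self) -> None:
        self.rows: dict[int, list[int]] = {}

    def add_row(self, block_ids: list[int], req_index: int) -> None:
        self.rows[req_index] = block_ids

def add_request(table: ToyBlockTable, block_ids: list[int], req_index: int) -> None:
    # Attention free requests have no blocks assigned, so no row is added.
    if block_ids:
        table.add_row(block_ids, req_index)

table = ToyBlockTable()
add_request(table, [], req_index=0)     # skipped: attention free request
add_request(table, [3, 7], req_index=1)
print(table.rows)                       # {1: [3, 7]}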
