Skip to content

Commit fb3ecfb

Browse files
Disable chunked prefill and prefix caching when the model is attention free
Signed-off-by: Christian Pinto <christian.pinto@ibm.com>
1 parent 673aeb0 commit fb3ecfb

File tree

3 files changed

11 additions and 2 deletions

3 files changed

11 additions and 2 deletions

vllm/config.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4710,6 +4710,15 @@ def __post_init__(self):
47104710
"Only \"last\" pooling supports chunked "
47114711
"prefill and prefix caching; disabling both.")
47124712

4713+
if self.model_config.is_attention_free:
4714+
# If the model is not of pooling type and it is attention free,
4715+
# we make sure chunked prefill and prefix_caching are
4716+
# disabled so that the correct KVCacheCoordinator
4717+
# is loaded.
4718+
disable_chunked_prefill_reasons.append(
4719+
"This is an attention free model, "
4720+
"disabling chunked prefill and prefix caching.")
4721+
47134722
if disable_chunked_prefill_reasons:
47144723
for reason in disable_chunked_prefill_reasons:
47154724
logger.info(reason)

vllm/v1/core/kv_cache_coordinator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -388,7 +388,7 @@ def get_kv_cache_coordinator(
388388
kv_cache_config: KVCacheConfig, max_model_len: int, use_eagle: bool,
389389
enable_caching: bool, caching_hash_fn: Callable,
390390
enable_kv_cache_events: bool) -> KVCacheCoordinator:
391-
if not enable_caching or len(kv_cache_config.kv_cache_groups) == 0:
391+
if not enable_caching:
392392
# We instantiate this coordinator also for attention free models that
393393
# have 0 kv_cache_groups
394394
return KVCacheCoordinatorNoPrefixCache(kv_cache_config, max_model_len,

vllm/v1/core/kv_cache_manager.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ def __init__(
8989
self.prefix_cache_stats = PrefixCacheStats() if log_stats else None
9090

9191
self.block_size: Optional[int] = None
92-
if self.enable_caching and len(kv_cache_config.kv_cache_groups) > 0:
92+
if self.enable_caching:
9393
assert len(
9494
set(g.kv_cache_spec.block_size
9595
for g in kv_cache_config.kv_cache_groups)

0 commit comments

Comments
 (0)