
Commit fa116eb

[0.7.3][V1] Support the feature of prefix cache in v1 (#559)
### What this PR does / why we need it?
Support the feature of prefix cache in v1.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
Comprehensive unit tests for ops accuracy have been performed and will be included in another PR.

Signed-off-by: rjg-lyh <1318825571@qq.com>
1 parent d922fb9 commit fa116eb
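As a usage note (not part of the commit): with this change in place, prefix caching can be exercised on the V1 engine roughly as follows. This is a hedged sketch assuming a working vllm + vllm-ascend installation on an NPU; the model name is purely illustrative, while `VLLM_USE_V1` and `enable_prefix_caching` come from upstream vLLM's environment variables and engine arguments.

```python
# Hedged usage sketch (not from the diff): exercising V1 prefix caching
# on an NPU with the vllm-ascend plugin installed.
import os

os.environ["VLLM_USE_V1"] = "1"  # opt into the V1 engine before importing vllm

from vllm import LLM, SamplingParams

# enable_prefix_caching is an upstream vLLM engine argument; the model name
# here is illustrative only.
llm = LLM(model="Qwen/Qwen2.5-7B-Instruct", enable_prefix_caching=True)

# Two prompts sharing a prefix: the second request should reuse the cached
# KV blocks produced by the first.
shared_prefix = "You are a helpful assistant. Answer concisely.\n\n"
prompts = [
    shared_prefix + "What is paged attention?",
    shared_prefix + "What is prefix caching?",
]
outputs = llm.generate(prompts, SamplingParams(max_tokens=64))
for out in outputs:
    print(out.outputs[0].text)
```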

3 files changed: +37 -14 lines changed

vllm_ascend/attention/attention_v1.py

Lines changed: 22 additions & 6 deletions
@@ -85,9 +85,10 @@ def copy_blocks(
 
 
 class AscendAttentionState(Enum):
-    PrefillOnly = 0
-    DecodeOnly = 1
-    ChunkedPrefill = 2
+    PrefillNoCache = 0
+    PrefillCacheHit = 1
+    DecodeOnly = 2
+    ChunkedPrefill = 3
 
 
 @dataclass
@@ -214,7 +215,7 @@ def forward(
             # TODO: Add attr (num_prefills, prefill_metadata, decode_metadata) to AscendMetadata
             pass
         # V0-Style scheduler situation.
-        elif attn_metadata.attn_state == AscendAttentionState.PrefillOnly:
+        elif attn_metadata.attn_state == AscendAttentionState.PrefillNoCache:
             assert attn_metadata is not None
             assert attn_metadata.attn_mask is not None
             mask = attn_metadata.attn_mask
@@ -227,16 +228,31 @@ def forward(
                 num_heads=self.num_heads,
                 num_kv_heads=self.num_kv_heads,
                 out=output)
+        elif attn_metadata.attn_state == AscendAttentionState.PrefillCacheHit:
+            assert attn_metadata is not None
+            assert attn_metadata.attn_mask is not None
+            compress_mask = attn_metadata.attn_mask
+            torch_npu._npu_flash_attention_qlens(
+                query=query,
+                key_cache=self.key_cache,
+                value_cache=self.value_cache,
+                block_table=attn_metadata.block_tables,
+                mask=compress_mask,
+                seq_len=attn_metadata.seq_lens,
+                context_lens=attn_metadata.context_lens,
+                num_kv_heads=self.num_kv_heads,
+                num_heads=self.num_heads,
+                scale_value=self.scale,
+                out=output)
         elif attn_metadata.attn_state == AscendAttentionState.DecodeOnly:
-            block_tables = attn_metadata.block_tables
             torch_npu._npu_paged_attention(
                 query=query,
                 key_cache=self.key_cache,
                 value_cache=self.value_cache,
                 num_kv_heads=self.num_kv_heads,
                 num_heads=self.num_heads,
                 scale_value=self.scale,
-                block_table=block_tables,
+                block_table=attn_metadata.block_tables,
                 context_lens=attn_metadata.context_lens,
                 out=output)
         # Normal V1 situation.
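For readers skimming the hunk above: the enum now distinguishes four scheduling situations, and the `forward` branches dispatch each one to a different NPU kernel. The sketch below restates the enum with explanatory comments; the comments are an interpretation of this diff (the `_npu_flash_attention_qlens` and `_npu_paged_attention` mappings are taken from the branches above), not text from the source file.

```python
from enum import Enum

# Annotated restatement of the new enum; comments interpret the diff above.
class AscendAttentionState(Enum):
    # Prefill with nothing in the KV cache: dense attention over the prompt
    # using the full causal mask built in model_runner_v1.py.
    PrefillNoCache = 0
    # Prefill where a prefix is already cached: the new branch calls
    # torch_npu._npu_flash_attention_qlens with a compressed mask and the
    # request's block tables.
    PrefillCacheHit = 1
    # Pure decode (one new token per request): served by
    # torch_npu._npu_paged_attention over the paged KV cache.
    DecodeOnly = 2
    # Mixed prefill/decode batches produced when chunked prefill is enabled.
    ChunkedPrefill = 3
```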

vllm_ascend/platform.py

Lines changed: 2 additions & 2 deletions
@@ -138,9 +138,9 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
 
         if envs.VLLM_USE_V1 and cache_config and cache_config.enable_prefix_caching:
             logger.warning(
-                "Prefix caching is not supported for V1 now, disable prefix caching"
+                "Prefix caching is now supported for V1 on NPU, "
+                "but it is still experimental and there may be issues with accuracy."
             )
-            cache_config.enable_prefix_caching = False
 
         if envs.VLLM_USE_V1:
             # Activate custom ops for v1.
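The practical effect of this hunk is that `check_and_update_config` no longer forces `enable_prefix_caching` back to `False` under V1; it only logs a warning. A minimal, self-contained sketch of that branch, with a stdlib logger and plain booleans standing in for `envs.VLLM_USE_V1` and the cache config:

```python
import logging

logger = logging.getLogger("vllm_ascend.platform")

def check_prefix_caching(use_v1: bool, enable_prefix_caching: bool) -> bool:
    # Mirrors the patched branch: before this commit the flag was forced to
    # False here; now the user's setting is preserved and only a warning is
    # logged.
    if use_v1 and enable_prefix_caching:
        logger.warning(
            "Prefix caching is now supported for V1 on NPU, "
            "but it is still experimental and there may be issues with accuracy.")
    return enable_prefix_caching

# The setting survives the check instead of being silently disabled.
assert check_prefix_caching(use_v1=True, enable_prefix_caching=True) is True
```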

vllm_ascend/worker/model_runner_v1.py

Lines changed: 13 additions & 6 deletions
@@ -71,6 +71,7 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
         self.speculative_config = vllm_config.speculative_config
         self.prompt_adapter_config = vllm_config.prompt_adapter_config
         self.observability_config = vllm_config.observability_config
+        self.chunked_prefill_enabled = vllm_config.scheduler_config.chunked_prefill_enabled
 
         model_config = self.model_config
         cache_config = self.cache_config
@@ -419,11 +420,15 @@ def make_attention_mask(self, seq_lens, query_lens, position,
         if attn_state == AscendAttentionState.ChunkedPrefill:
             return self.attn_mask_builder.get_splitfuse_attn_mask(
                 seq_lens, query_lens, position, self.dtype, self.device)
-        # Prefill-only situation.
-        elif attn_state == AscendAttentionState.PrefillOnly:
+        # Prefill without cache situation.
+        elif attn_state == AscendAttentionState.PrefillNoCache:
             max_seq_len = max(seq_lens, default=0)
             return self.attn_mask_builder.get_attn_mask(
                 max_seq_len, self.dtype, self.device)
+        # Prefill with cache hit.
+        elif attn_state == AscendAttentionState.PrefillCacheHit:
+            return self.attn_mask_builder.get_attn_mask(
+                128, self.dtype, self.device)
         # Decode-only situation.
         else:
             return None
@@ -492,13 +497,15 @@ def _process_reqs(
         slot_mapping = self.slot_mapping_cpu[:total_num_scheduled_tokens].to(
             self.device, non_blocking=True)
 
-        attn_state = AscendAttentionState.ChunkedPrefill
-        if np.array_equal(self.seq_lens_np[:num_reqs], num_scheduled_tokens):
-            attn_state = AscendAttentionState.PrefillOnly
+        if self.chunked_prefill_enabled:
+            attn_state = AscendAttentionState.ChunkedPrefill
+        elif np.array_equal(self.seq_lens_np[:num_reqs], num_scheduled_tokens):
+            attn_state = AscendAttentionState.PrefillNoCache
+        # We assume it is the decode stage, where prefill occurs but only one token is not hit in cache.
         elif np.all(num_scheduled_tokens == 1):
             attn_state = AscendAttentionState.DecodeOnly
         else:
-            attn_state = AscendAttentionState.ChunkedPrefill
+            attn_state = AscendAttentionState.PrefillCacheHit
 
         attn_mask = self.make_attention_mask(seq_lens=seq_lens,
                                              query_lens=num_scheduled_tokens,
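The key addition in `_process_reqs` is the state-selection chain. The sketch below reproduces that logic as a free function over NumPy arrays so it can be read (and run) in isolation; `seq_lens` stands in for `self.seq_lens_np[:num_reqs]`, and state names are returned as strings rather than the real enum.

```python
import numpy as np

# Hedged, self-contained restatement of the selection logic added above.
# seq_lens: total length (cached context + new tokens) per request.
# num_scheduled_tokens: tokens scheduled for each request in this step.
def select_attn_state(chunked_prefill_enabled: bool,
                      seq_lens: np.ndarray,
                      num_scheduled_tokens: np.ndarray) -> str:
    if chunked_prefill_enabled:
        # The chunked-prefill scheduler mixes prefill and decode tokens.
        return "ChunkedPrefill"
    if np.array_equal(seq_lens, num_scheduled_tokens):
        # Every scheduled token is new: prefill with an empty cache.
        return "PrefillNoCache"
    if np.all(num_scheduled_tokens == 1):
        # One token per request: pure decode over the paged KV cache.
        return "DecodeOnly"
    # Otherwise part of each sequence was already found in the prefix cache.
    return "PrefillCacheHit"

# Example: 2 requests, 10 cached + 6 new tokens each -> prefix-cache hit.
print(select_attn_state(False, np.array([16, 16]), np.array([6, 6])))
```

Note the design choice this encodes: when chunked prefill is disabled, any batch that schedules more than one token per request while part of each sequence is already cached now falls into the new `PrefillCacheHit` branch instead of `ChunkedPrefill`.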
