                                                LinearBase, RowParallelLinear,
                                                UnquantizedLinearMethod)
 from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
+from vllm.v1.attention.backends.utils import CommonAttentionMetadata

 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla
+from vllm_ascend.utils import vllm_version_is
 from vllm_ascend.worker.model_runner_v1 import NPUModelRunner

 if TYPE_CHECKING:
     from vllm.v1.core.sched.output import SchedulerOutput
     from vllm.v1.worker.gpu_input_batch import InputBatch

+if vllm_version_is("main"):
+    from vllm.v1.attention.backends.utils import CommonAttentionMetadata
+

 class AscendMLABackend(AttentionBackend):

@@ -57,6 +62,7 @@ class AscendMLAPrefillMetadata:
     seq_lens: list[int]
     context_lens: torch.Tensor
     input_positions: torch.Tensor
+    query_start_loc: torch.Tensor
     block_table: torch.Tensor
     max_query_len: int
     max_seq_lens: int
@@ -90,6 +96,9 @@ class AscendMLAMetadata:

     num_actual_tokens: int  # Number of tokens excluding padding.
     slot_mapping: torch.Tensor
+    query_start_loc: torch.Tensor
+    seq_lens: torch.Tensor
+    block_tables: torch.Tensor

     # New for MLA (compared to FlashAttention)
     # For handling prefill decode split
@@ -231,6 +240,7 @@ def build(self,
               num_actual_tokens: int,
               max_query_len: int,
               common_prefix_len: Optional[int] = None,
+              common_attn_metadata: CommonAttentionMetadata = None,
               graph_pad_size: int = -1) -> AscendMLAMetadata:
         assert self._num_decodes + self._num_prefills == num_reqs

@@ -245,6 +255,7 @@ def build(self,
         input_positions = self.runner.positions_cpu[:num_actual_tokens].to(
             device, non_blocking=True).long()

+        query_start_loc = common_attn_metadata.query_start_loc
         seq_lens_cpu = self.runner.seq_lens_cpu[:num_reqs]
         query_lens = seq_lens_cpu - self.runner.input_batch.num_computed_tokens_cpu_tensor[:
             num_reqs]
@@ -258,6 +269,8 @@ def build(self,
             tokens_start = self._num_decode_tokens
             max_query_len = query_lens[tokens_start:].max().item()
             max_seq_lens = seq_lens[tokens_start:].max().item()
+            prefill_query_start_loc = query_start_loc[
+                reqs_start:] - query_start_loc[reqs_start]

             prefill_metadata = AscendMLAPrefillMetadata(
                 attn_mask=self.runner.attn_mask,
@@ -268,6 +281,7 @@ def build(self,
                 block_table=block_table[reqs_start:, ...],
                 max_query_len=max_query_len,
                 max_seq_lens=max_seq_lens,
+                query_start_loc=prefill_query_start_loc,
             )

         decode_metadata = None
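
A note on the `prefill_query_start_loc` added above: `common_attn_metadata.query_start_loc` holds the cumulative per-request query-token offsets (length `num_reqs + 1`, starting at 0), and in this builder decode requests come before prefills. Slicing from `reqs_start` and subtracting `query_start_loc[reqs_start]` rebases the offsets so the prefill-only slice starts at zero. A minimal sketch with made-up request sizes (the tensor values are illustrative, not taken from the source):

```python
import torch

# Illustrative batch: 4 requests, the first two are decodes (1 token each),
# the last two are prefills with 5 and 3 query tokens.
query_start_loc = torch.tensor([0, 1, 2, 7, 10])
reqs_start = 2  # index of the first prefill request

# Same arithmetic as in the builder: drop the decode entries and rebase
# the cumulative offsets so they start at zero for the prefill tokens.
prefill_query_start_loc = query_start_loc[reqs_start:] - query_start_loc[reqs_start]
print(prefill_query_start_loc)  # tensor([0, 5, 8])
```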
@@ -324,6 +338,9 @@ def build(self,
             attn_state=self.runner.attn_state,
             prefill=prefill_metadata,
             decode=decode_metadata,
+            query_start_loc=query_start_loc,
+            block_tables=block_table,
+            seq_lens=seq_lens,
         )

@@ -373,6 +390,12 @@ def __init__(
         self.qk_rope_head_dim = qk_rope_head_dim
         self.qk_head_dim = qk_head_dim
         self.v_head_dim = v_head_dim
+        # TODO: this padding should be removed once the kernel is ready
+        # npu_flash_attention only works when head_dim is divisible by 128, so we pad it to the target size here
+        # and slice the final result to guarantee its functionality.
+        self.padding_head_dim = (
+            (self.qk_nope_head_dim + self.qk_rope_head_dim - 1) // 128 +
+            1) * 128

         # Hack for V1 for now to avoid torch library overhead (since we are
         # already inside an attention custom op), pull out the forward
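
To make the padding comment above concrete: `padding_head_dim` rounds `qk_nope_head_dim + qk_rope_head_dim` up to the next multiple of 128. A small sketch of the same round-up expression (the helper name and example sizes are mine, chosen to resemble a typical MLA config of 128 nope + 64 rope dims):

```python
def round_up_to_multiple_of_128(head_dim: int) -> int:
    # Same arithmetic as padding_head_dim above: round up to the next
    # multiple of 128; an already-aligned value is left unchanged.
    return ((head_dim - 1) // 128 + 1) * 128

# 128 nope dims + 64 rope dims -> 192, padded to 256 for the kernel.
assert round_up_to_multiple_of_128(128 + 64) == 256
assert round_up_to_multiple_of_128(128) == 128
```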
@@ -470,11 +493,9 @@ def get_and_maybe_dequant_weights(layer: LinearBase):
             [self.qk_nope_head_dim, self.v_head_dim], dim=-1)

         # Convert from (L, N, V) to (N, L, V)
-        self.W_UV = W_UV.transpose(0, 1).contiguous()
+        self.W_UV = W_UV.transpose(0, 1)
         # Convert from (L, N, P) to (N, P, L)
-        self.W_UK_T = W_UK.permute(1, 2, 0).contiguous()
-        self.W_UV.data = torch_npu.npu_format_cast(self.W_UV.data, 29)
-        self.W_UK_T.data = torch_npu.npu_format_cast(self.W_UK_T.data, 29)
+        self.W_UK_T = W_UK.permute(1, 2, 0)

     def _forward_prefill(
         self,
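
The hunk above only drops the `.contiguous()` calls and the `torch_npu.npu_format_cast(..., 29)` casts; the `(L, N, V) -> (N, L, V)` and `(L, N, P) -> (N, P, L)` reshapes are unchanged, so `W_UV` and `W_UK_T` are now kept as plain (possibly non-contiguous) views in the default tensor format. A quick shape check with illustrative dimensions (the sizes are arbitrary, not from a real model):

```python
import torch

# L = kv_lora_rank, N = num_heads, P = qk_nope_head_dim, V = v_head_dim
# (illustrative sizes only).
L, N, P, V = 512, 16, 128, 128
W_UK = torch.randn(L, N, P)
W_UV = torch.randn(L, N, V)

# Same reshapes as in the diff, without .contiguous() or a format cast.
assert W_UV.transpose(0, 1).shape == (N, L, V)   # (L, N, V) -> (N, L, V)
assert W_UK.permute(1, 2, 0).shape == (N, P, L)  # (L, N, P) -> (N, P, L)
```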
@@ -514,7 +535,7 @@ def _forward_prefill(
         elif attn_metadata.attn_state == AscendAttentionState.PrefillNoCache:
             attn_output = torch.empty(num_tokens,
                                       self.num_heads,
-                                      self.v_head_dim,
+                                      self.padding_head_dim,
                                       dtype=query.dtype,
                                       device=query.device)
             k_nope, value = self.kv_b_proj(kv_c_normed)[0].view(
@@ -523,17 +544,31 @@ def _forward_prefill(
                 [self.qk_nope_head_dim, self.v_head_dim], dim=-1)
             key = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))),
                             dim=-1)
+            pad_query = torch.nn.functional.pad(query, [
+                0, self.padding_head_dim - self.qk_rope_head_dim -
+                self.qk_nope_head_dim
+            ],
+                                                value=0)
+            pad_key = torch.nn.functional.pad(key, [
+                0, self.padding_head_dim - self.qk_rope_head_dim -
+                self.qk_nope_head_dim
+            ],
+                                              value=0)
+            pad_value = torch.nn.functional.pad(
+                value, [0, self.padding_head_dim - self.v_head_dim], value=0)
             torch_npu._npu_flash_attention(
-                query=query,
-                key=key,
-                value=value,
+                query=pad_query,
+                key=pad_key,
+                value=pad_value,
                 mask=attn_metadata.attn_mask,
                 seq_len=attn_metadata.prefill.context_lens,
                 scale_value=self.scale,
                 num_heads=self.num_heads,
                 num_kv_heads=self.num_heads,
                 out=attn_output)
-            attn_output = attn_output.view(-1, self.num_heads, self.v_head_dim)
+            attn_output = attn_output.view(
+                -1, self.num_heads,
+                self.padding_head_dim)[:, :, :self.v_head_dim]
         else:
             raise RuntimeError(
                 "Unexpected path reached, AscendMLAImpl should only have PrefillNoCache and ChunkedPrefill scenario in forward prefill, please file a bug to vllm-ascend !"