from typing import TYPE_CHECKING, Any, Callable, Literal, TypeVar, Union

import vllm.envs as envs
from transformers import PretrainedConfig

from vllm.config import ModelConfig, SpeculativeConfig

if TYPE_CHECKING:
    from _typeshed import DataclassInstance

    ConfigType = type[DataclassInstance]
else:
    ConfigType = type

ConfigT = TypeVar("ConfigT", bound=ConfigType)

TaskOption = Literal["auto", "generate", "embedding", "embed", "classify",
                     "score", "reward", "transcription"]

RunnerType = Literal["generate", "pooling", "draft", "transcription"]

HfOverrides = Union[dict[str, Any], Callable[[PretrainedConfig],
                                             PretrainedConfig]]


def __post_init__(self):

    # Note: "method" is a new parameter that helps to extend the
    # configuration of non-model-based proposers, and the "model" parameter
    # is used to set the draft model, EAGLE head, or additional weights when
    # needed. If the user does not specify "method", the speculative method
    # is detected automatically where possible; if it cannot be detected,
    # it defaults to "draft_model".

    if self.model is None and self.num_speculative_tokens is not None:
        # TODO(Shangming): Refactor mtp configuration logic when supporting
        # mtp acceleration for more models besides deepseek_v3
        if self.target_model_config and \
                (self.target_model_config.hf_text_config.model_type
                 == "deepseek_v3" or
                 self.target_model_config.hf_text_config.model_type
                 == "mimo"):
            # Use the draft model from the same model:
            self.model = self.target_model_config.model
        elif self.method in ("ngram", "[ngram]"):
            self.model = "ngram"
        else:
            raise ValueError(
                "num_speculative_tokens was provided without a "
                "speculative model.")
51
+
52
+ # Automatically configure the method for ngram when "model" is used
53
+ # instead of "method"
54
+ if self .method is None and (self .model is not None
55
+ and self .model in ("ngram" , "[ngram]" )):
56
+ self .method = "ngram"
57
+
58
+ if self .method in ("ngram" , "[ngram]" ):
59
+ # Unified to "ngram" internally
60
+ self .method = "ngram"
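        # prompt_lookup_min/max bound the n-gram window sizes used when
        # matching against the prompt to propose draft tokens
        # (prompt-lookup decoding).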
        # Set default values if not provided.
        if (self.prompt_lookup_min is None
                and self.prompt_lookup_max is None):
            # TODO(woosuk): Tune these values. They are arbitrarily chosen.
            self.prompt_lookup_min = 5
            self.prompt_lookup_max = 5
        elif self.prompt_lookup_min is None:
            assert self.prompt_lookup_max is not None
            self.prompt_lookup_min = self.prompt_lookup_max
        elif self.prompt_lookup_max is None:
            assert self.prompt_lookup_min is not None
            self.prompt_lookup_max = self.prompt_lookup_min

        # Validate values.
        if self.prompt_lookup_min < 1:
            raise ValueError(
                f"prompt_lookup_min={self.prompt_lookup_min} must be > 0")
        if self.prompt_lookup_max < 1:
            raise ValueError(
                f"prompt_lookup_max={self.prompt_lookup_max} must be > 0")
        if self.prompt_lookup_min > self.prompt_lookup_max:
            raise ValueError(
                f"prompt_lookup_min={self.prompt_lookup_min} must "
                f"be <= prompt_lookup_max={self.prompt_lookup_max}")

        # TODO: Currently we still need to extract vocab_size from the
        # target model config; in the future this may be refactored out so
        # the draft-related config can be set to None here.
        self.draft_model_config = self.target_model_config
        self.draft_parallel_config = self.target_parallel_config
    else:
        self.prompt_lookup_max = 0
        self.prompt_lookup_min = 0

        if self.model is not None:
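            # Build the draft ModelConfig, inheriting tokenizer, dtype, seed,
            # and related settings from the target model.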
            self.draft_model_config = ModelConfig(
                model=self.model,
                task="draft",
                tokenizer=self.target_model_config.tokenizer,
                tokenizer_mode=self.target_model_config.tokenizer_mode,
                trust_remote_code=self.target_model_config.trust_remote_code,
                allowed_local_media_path=self.target_model_config.
                allowed_local_media_path,
                dtype=self.target_model_config.dtype,
                seed=self.target_model_config.seed,
                revision=self.revision,
                code_revision=self.code_revision,
                tokenizer_revision=self.target_model_config.tokenizer_revision,
                spec_target_max_model_len=self.target_model_config.
                max_model_len,
                quantization=self.quantization,
                enforce_eager=self.target_model_config.enforce_eager,
                max_seq_len_to_capture=self.target_model_config.
                max_seq_len_to_capture,
                max_logprobs=self.target_model_config.max_logprobs,
                hf_overrides=SpeculativeConfig.hf_config_override,
            )

            # Automatically detect the method.
            if self.method in ("eagle", "eagle3"):
                pass
            elif "eagle-" in self.draft_model_config.model.lower() or \
                    "eagle3-" in self.draft_model_config.model.lower():
                self.method = "eagle"
            elif self.draft_model_config.hf_config.model_type == "medusa":
                self.method = "medusa"
            elif (self.draft_model_config.hf_config.model_type ==
                  "mlp_speculator"):
                self.method = "mlp_speculator"
            elif (self.draft_model_config.hf_config.model_type ==
                  "deepseek_mtp"):
                self.method = "mtp"
            else:
                self.method = "draft_model"

            # Replace hf_config for EAGLE draft_model.
            if self.method in ("eagle", "eagle3"):
                if self.enable_chunked_prefill and not envs.VLLM_USE_V1:
                    raise ValueError(
                        "Chunked prefill and EAGLE are not compatible "
                        "when using V0.")

                from vllm.platforms import current_platform
                from vllm.transformers_utils.configs.eagle import EAGLEConfig
                if isinstance(self.draft_model_config.hf_config,
                              EAGLEConfig) or current_platform.is_neuron():
                    pass
                else:
                    eagle_config = EAGLEConfig(
                        self.draft_model_config.hf_config,
                        method=self.method)
                    self.draft_model_config.hf_config = eagle_config

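            # If the draft HF config exposes num_lookahead_tokens, keep it
            # in sync with the requested number of speculative tokens.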
            if (self.num_speculative_tokens is not None
                    and hasattr(self.draft_model_config.hf_config,
                                "num_lookahead_tokens")):
                self.draft_model_config.hf_config.num_lookahead_tokens = \
                    self.num_speculative_tokens

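            # Some draft configs (e.g. MLP-speculator-style heads) declare
            # n_predict, the number of tokens the head can propose per step;
            # use it as the default and enforce divisibility below.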
            n_predict = getattr(self.draft_model_config.hf_config,
                                "n_predict", None)
            if n_predict is not None:
                if self.num_speculative_tokens is None:
                    # Default to max value defined in draft model config.
                    self.num_speculative_tokens = n_predict
                elif self.num_speculative_tokens > n_predict and \
                        self.num_speculative_tokens % n_predict != 0:
                    # Ensure divisibility for MTP module reuse.
                    raise ValueError(
                        f"num_speculative_tokens:"
                        f"{self.num_speculative_tokens} must be divisible "
                        f"by {n_predict=}")

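            # Resolve the draft model's tensor-parallel size, max_model_len,
            # and parallel config from the target configuration.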
            self.draft_tensor_parallel_size = \
                SpeculativeConfig._verify_and_get_draft_tp(
                    self.target_parallel_config,
                    self.draft_tensor_parallel_size,
                    self.draft_model_config.hf_config)

            self.draft_model_config.max_model_len = (
                SpeculativeConfig._maybe_override_draft_max_model_len(
                    self.max_model_len,
                    self.draft_model_config.max_model_len,
                    self.target_model_config.max_model_len,
                ))

            self.draft_parallel_config = (
                SpeculativeConfig.create_draft_parallel_config(
                    self.target_parallel_config,
                    self.draft_tensor_parallel_size))

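    # Default posterior threshold/alpha for the typical acceptance sampler
    # when the user has not set them.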
    if self.acceptance_method == "typical_acceptance_sampler":
        if self.posterior_threshold is None:
            self.posterior_threshold = 0.09
        if self.posterior_alpha is None:
            self.posterior_alpha = 0.3

    self._verify_args()


SpeculativeConfig.__post_init__ = __post_init__
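
# Illustrative sketch (not part of the patch above): with this override
# installed, an ngram-based SpeculativeConfig could be built roughly as
# follows. The exact constructor signature is an assumption; only fields
# referenced in __post_init__ above are shown.
#
#     spec_config = SpeculativeConfig(
#         method="ngram",
#         num_speculative_tokens=3,
#         prompt_lookup_min=2,
#         prompt_lookup_max=5,
#     )
#     # __post_init__ normalizes the method to "ngram", fills in or checks
#     # the prompt-lookup window, and reuses the target model config as the
#     # draft config.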