From 896a22f9342b4683269f39f857cd6f4d664e770a Mon Sep 17 00:00:00 2001
From: Shanshan Shen <87969357+shen-shanshan@users.noreply.github.com>
Date: Wed, 9 Jul 2025 10:53:53 +0800
Subject: [PATCH] remove v0 prompt adapter

Signed-off-by: Shanshan Shen <87969357+shen-shanshan@users.noreply.github.com>
---
 pytest.ini                                 |  1 -
 vllm_ascend/worker/draft_model_runner.py   | 17 +----------------
 vllm_ascend/worker/model_runner.py         | 23 -----------------------
 vllm_ascend/worker/multi_step_runner.py    |  3 ---
 vllm_ascend/worker/pooling_model_runner.py |  7 -------
 vllm_ascend/worker/worker.py               | 18 ------------------
 6 files changed, 1 insertion(+), 68 deletions(-)

diff --git a/pytest.ini b/pytest.ini
index e59921e55d..e32b265bef 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -11,7 +11,6 @@ norecursedirs =
     vllm-empty/tests/kv_transfer
     vllm-empty/tests/plugins
     vllm-empty/tests/plugins_tests
-    vllm-empty/tests/prompt_adapter
     vllm-empty/tests/compile
     vllm-empty/tests/lora
     vllm-empty/tests/models
diff --git a/vllm_ascend/worker/draft_model_runner.py b/vllm_ascend/worker/draft_model_runner.py
index b070da1a7f..44812efcb7 100644
--- a/vllm_ascend/worker/draft_model_runner.py
+++ b/vllm_ascend/worker/draft_model_runner.py
@@ -139,7 +139,6 @@ def supports_gpu_multi_step(self, execute_model_req: ExecuteModelRequest):
             1. Only decodes
             2. Only flash-attn
             3. No LORA
-            4. No prompt_adapter_config
         """
         if not allow_gpu_advance_step:
             return False
@@ -155,11 +154,7 @@ def supports_gpu_multi_step(self, execute_model_req: ExecuteModelRequest):
             return False
 
         # TODO: Add support for LORA
-        if self.lora_config:
-            return False
-
-        # TODO: Add soft-tuning prompt adapter support
-        return not self.prompt_adapter_config
+        return not self.lora_config
 
     def set_indices_of_seq_with_bonus_tokens(self,
                                              indices_of_seq_with_bonus_tokens):
@@ -202,9 +197,6 @@ def execute_model(
         # Sanity
         if self.lora_config is not None:
             raise ValueError("TP1DraftModelRunner has no support for LORA")
-        if self.prompt_adapter_config is not None:
-            raise ValueError("TP1DraftModelRunner has no support for "
-                             "prompt_adapter_config")
         if model_input.inputs_embeds is not None:
             raise ValueError("TP1DraftModelRunner has no support for "
                              "inputs_embeds")
@@ -219,13 +211,6 @@ def execute_model(
             self.set_active_loras(model_input.lora_requests,
                                   model_input.lora_mapping)
 
-        if self.prompt_adapter_config:
-            assert model_input.prompt_adapter_requests is not None
-            assert model_input.prompt_adapter_mapping is not None
-            self.set_active_prompt_adapters(
-                model_input.prompt_adapter_requests,
-                model_input.prompt_adapter_mapping)
-
         self.attn_state.begin_forward(model_input)
 
         # Detect exec mode
diff --git a/vllm_ascend/worker/model_runner.py b/vllm_ascend/worker/model_runner.py
index 48c5d4b68f..9cc74ff7c5 100644
--- a/vllm_ascend/worker/model_runner.py
+++ b/vllm_ascend/worker/model_runner.py
@@ -52,8 +52,6 @@
 from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
                              MultiModalKwargs, MultiModalPlaceholderMap,
                              MultiModalRegistry)
-from vllm.prompt_adapter.layers import PromptAdapterMapping
-from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import IntermediateTensors, SequenceGroupMetadata
 from vllm.utils import (DeviceMemoryProfiler, PyObjectCache, flatten_2d_lists,
@@ -1237,27 +1235,6 @@ def list_loras(self) -> Set[int]:
             raise RuntimeError("LoRA is not enabled.")
         return self.lora_manager.list_adapters()
 
-    def remove_all_prompt_adapters(self):
-        raise RuntimeError("PromptAdapter is not supported on NPU now.")
-
-    def set_active_prompt_adapters(
-            self, prompt_adapter_requests: Set[PromptAdapterRequest],
-            prompt_adapter_mapping: PromptAdapterMapping) -> None:
-        raise RuntimeError("PromptAdapter is not supported on NPU now.")
-
-    def add_prompt_adapter(
-            self, prompt_adapter_request: PromptAdapterRequest) -> bool:
-        raise RuntimeError("PromptAdapter is not supported on NPU now.")
-
-    def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
-        raise RuntimeError("PromptAdapter is not supported on NPU now.")
-
-    def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
-        raise RuntimeError("PromptAdapter is not supported on NPU now.")
-
-    def list_prompt_adapters(self) -> Set[int]:
-        raise RuntimeError("PromptAdapter is not supported on NPU now.")
-
     @property
     def vocab_size(self) -> int:
         return self.model_config.get_vocab_size()
diff --git a/vllm_ascend/worker/multi_step_runner.py b/vllm_ascend/worker/multi_step_runner.py
index 028bcd05df..83e993ba7d 100644
--- a/vllm_ascend/worker/multi_step_runner.py
+++ b/vllm_ascend/worker/multi_step_runner.py
@@ -91,9 +91,6 @@ def maybe_advance_frozen_model_input(self, device: str, pin_memory: bool):
         # assert fmi.lora_mapping is None
         # assert fmi.lora_requests is not None
         # assert len(fmi.lora_requests) == 0
-        # assert fmi.prompt_adapter_mapping is None
-        # assert fmi.prompt_adapter_requests is not None
-        # assert len(fmi.prompt_adapter_requests) == 0
         assert fmi.attn_metadata is not None
         assert fmi.multi_modal_kwargs is not None
         assert len(fmi.multi_modal_kwargs) == 0
diff --git a/vllm_ascend/worker/pooling_model_runner.py b/vllm_ascend/worker/pooling_model_runner.py
index e1262fb0a2..85d3780bdb 100644
--- a/vllm_ascend/worker/pooling_model_runner.py
+++ b/vllm_ascend/worker/pooling_model_runner.py
@@ -116,13 +116,6 @@ def execute_model(
             self.set_active_loras(model_input.lora_requests,
                                   model_input.lora_mapping)
 
-        if self.prompt_adapter_config:
-            assert model_input.prompt_adapter_requests is not None
-            assert model_input.prompt_adapter_mapping is not None
-            self.set_active_prompt_adapters(
-                model_input.prompt_adapter_requests,
-                model_input.prompt_adapter_mapping)
-
         assert model_input.attn_metadata is not None
         virtual_engine = model_input.virtual_engine
         model_executable = self.model
diff --git a/vllm_ascend/worker/worker.py b/vllm_ascend/worker/worker.py
index bffc6a8de8..8ecea21bee 100644
--- a/vllm_ascend/worker/worker.py
+++ b/vllm_ascend/worker/worker.py
@@ -37,7 +37,6 @@
 from vllm.model_executor import set_random_seed
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
-from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sequence import (ExecuteModelRequest, IntermediateTensors,
                            SequenceGroupMetadata, SequenceGroupMetadataDelta)
 from vllm.utils import GiB_bytes, bind_kv_cache, get_ip
@@ -503,23 +502,6 @@ def pin_lora(self, lora_id: int) -> bool:
     def list_loras(self) -> Set[int]:
         return self.model_runner.list_loras()
 
-    def add_prompt_adapter(
-            self, prompt_adapter_request: PromptAdapterRequest) -> bool:
-        raise NotImplementedError(
-            "Prompt Adapter is not implemented for NPU backend currently.")
-
-    def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
-        raise NotImplementedError(
-            "Prompt Adapter is not implemented for NPU backend currently.")
-
-    def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
-        raise NotImplementedError(
-            "Prompt Adapter is not implemented for NPU backend currently.")
-
-    def list_prompt_adapters(self) -> Set[int]:
-        raise NotImplementedError(
-            "Prompt Adapter is not implemented for NPU backend currently.")
-
     @property
     def max_model_len(self) -> int:
         return self.model_config.max_model_len