
[V0 Deprecation] Remove prompt adapter in V1 #1683

Closed · wants to merge 1 commit
1 change: 0 additions & 1 deletion pytest.ini
@@ -11,7 +11,6 @@ norecursedirs =
vllm-empty/tests/kv_transfer
vllm-empty/tests/plugins
vllm-empty/tests/plugins_tests
-vllm-empty/tests/prompt_adapter
vllm-empty/tests/compile
vllm-empty/tests/lora
vllm-empty/tests/models
17 changes: 1 addition & 16 deletions vllm_ascend/worker/draft_model_runner.py
@@ -139,7 +139,6 @@ def supports_gpu_multi_step(self, execute_model_req: ExecuteModelRequest):
1. Only decodes
2. Only flash-attn
3. No LORA
-4. No prompt_adapter_config
"""
if not allow_gpu_advance_step:
return False
@@ -155,11 +154,7 @@ def supports_gpu_multi_step(self, execute_model_req: ExecuteModelRequest):
return False

# TODO: Add support for LORA
-if self.lora_config:
-    return False
-
-# TODO: Add soft-tuning prompt adapter support
-return not self.prompt_adapter_config
+return not self.lora_config

def set_indices_of_seq_with_bonus_tokens(self,
indices_of_seq_with_bonus_tokens):
@@ -202,9 +197,6 @@ def execute_model(
# Sanity
if self.lora_config is not None:
raise ValueError("TP1DraftModelRunner has no support for LORA")
-if self.prompt_adapter_config is not None:
-    raise ValueError("TP1DraftModelRunner has no support for "
-                     "prompt_adapter_config")
if model_input.inputs_embeds is not None:
raise ValueError("TP1DraftModelRunner has no support for "
"inputs_embeds")
@@ -219,13 +211,6 @@ def execute_model(
self.set_active_loras(model_input.lora_requests,
model_input.lora_mapping)

-if self.prompt_adapter_config:
-    assert model_input.prompt_adapter_requests is not None
-    assert model_input.prompt_adapter_mapping is not None
-    self.set_active_prompt_adapters(
-        model_input.prompt_adapter_requests,
-        model_input.prompt_adapter_mapping)

self.attn_state.begin_forward(model_input)

# Detect exec mode
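
A minimal, illustrative sketch of what the eligibility check in `supports_gpu_multi_step` reduces to after this change (the class, the attribute default, and the `allow_gpu_advance_step` argument below are hypothetical stand-ins, and the decode/attention checks from the real runner are elided):

```python
# Illustrative sketch only -- not the actual vllm-ascend TP1DraftModelRunner.
# It mirrors the logic left in the diff above: with the prompt adapter
# condition gone, GPU multi-step advancement is gated only on the LoRA config.
from dataclasses import dataclass
from typing import Optional


@dataclass
class DraftRunnerSketch:
    lora_config: Optional[object] = None  # hypothetical stand-in attribute

    def supports_gpu_multi_step(self, allow_gpu_advance_step: bool = True) -> bool:
        if not allow_gpu_advance_step:
            return False
        # TODO upstream: add LoRA support; until then a LoRA config disables it.
        return not self.lora_config


print(DraftRunnerSketch().supports_gpu_multi_step())                      # True
print(DraftRunnerSketch(lora_config=object()).supports_gpu_multi_step())  # False
```
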
23 changes: 0 additions & 23 deletions vllm_ascend/worker/model_runner.py
@@ -52,8 +52,6 @@
from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,

[inline review comment]
Collaborator: For V0 code, this file can be removed entirely. I think we can remove V0 first, then remove the related prompt adapter code in V1.

Collaborator (Author): OK.

MultiModalKwargs, MultiModalPlaceholderMap,
MultiModalRegistry)
-from vllm.prompt_adapter.layers import PromptAdapterMapping
-from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import SamplingParams
from vllm.sequence import IntermediateTensors, SequenceGroupMetadata
from vllm.utils import (DeviceMemoryProfiler, PyObjectCache, flatten_2d_lists,
@@ -1237,27 +1235,6 @@ def list_loras(self) -> Set[int]:
raise RuntimeError("LoRA is not enabled.")
return self.lora_manager.list_adapters()

-def remove_all_prompt_adapters(self):
-    raise RuntimeError("PromptAdapter is not supported on NPU now.")
-
-def set_active_prompt_adapters(
-        self, prompt_adapter_requests: Set[PromptAdapterRequest],
-        prompt_adapter_mapping: PromptAdapterMapping) -> None:
-    raise RuntimeError("PromptAdapter is not supported on NPU now.")
-
-def add_prompt_adapter(
-        self, prompt_adapter_request: PromptAdapterRequest) -> bool:
-    raise RuntimeError("PromptAdapter is not supported on NPU now.")
-
-def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
-    raise RuntimeError("PromptAdapter is not supported on NPU now.")
-
-def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
-    raise RuntimeError("PromptAdapter is not supported on NPU now.")
-
-def list_prompt_adapters(self) -> Set[int]:
-    raise RuntimeError("PromptAdapter is not supported on NPU now.")
-
@property
def vocab_size(self) -> int:
return self.model_config.get_vocab_size()
3 changes: 0 additions & 3 deletions vllm_ascend/worker/multi_step_runner.py
@@ -91,9 +91,6 @@ def maybe_advance_frozen_model_input(self, device: str, pin_memory: bool):
# assert fmi.lora_mapping is None
# assert fmi.lora_requests is not None
# assert len(fmi.lora_requests) == 0
-# assert fmi.prompt_adapter_mapping is None
-# assert fmi.prompt_adapter_requests is not None
-# assert len(fmi.prompt_adapter_requests) == 0
assert fmi.attn_metadata is not None
assert fmi.multi_modal_kwargs is not None
assert len(fmi.multi_modal_kwargs) == 0
7 changes: 0 additions & 7 deletions vllm_ascend/worker/pooling_model_runner.py
@@ -116,13 +116,6 @@ def execute_model(
self.set_active_loras(model_input.lora_requests,
model_input.lora_mapping)

-if self.prompt_adapter_config:
-    assert model_input.prompt_adapter_requests is not None
-    assert model_input.prompt_adapter_mapping is not None
-    self.set_active_prompt_adapters(
-        model_input.prompt_adapter_requests,
-        model_input.prompt_adapter_mapping)

assert model_input.attn_metadata is not None
virtual_engine = model_input.virtual_engine
model_executable = self.model
18 changes: 0 additions & 18 deletions vllm_ascend/worker/worker.py
@@ -37,7 +37,6 @@
from vllm.model_executor import set_random_seed
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
-from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sequence import (ExecuteModelRequest, IntermediateTensors,
SequenceGroupMetadata, SequenceGroupMetadataDelta)
from vllm.utils import GiB_bytes, bind_kv_cache, get_ip
@@ -503,23 +502,6 @@ def pin_lora(self, lora_id: int) -> bool:
def list_loras(self) -> Set[int]:
return self.model_runner.list_loras()

-def add_prompt_adapter(
-        self, prompt_adapter_request: PromptAdapterRequest) -> bool:
-    raise NotImplementedError(
-        "Prompt Adapter is not implemented for NPU backend currently.")
-
-def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
-    raise NotImplementedError(
-        "Prompt Adapter is not implemented for NPU backend currently.")
-
-def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
-    raise NotImplementedError(
-        "Prompt Adapter is not implemented for NPU backend currently.")
-
-def list_prompt_adapters(self) -> Set[int]:
-    raise NotImplementedError(
-        "Prompt Adapter is not implemented for NPU backend currently.")

@property
def max_model_len(self) -> int:
return self.model_config.max_model_len