From cf9ee08f65f7355d007057b578dedc0724ded444 Mon Sep 17 00:00:00 2001
From: mgoin
Date: Mon, 7 Jul 2025 18:36:21 +0000
Subject: [PATCH 1/4] [V0 Deprecation] Remove Prompt Adapters

Signed-off-by: mgoin
---
 docs/api/README.md                            |   1 -
 pyproject.toml                                |   1 -
 tests/entrypoints/openai/test_completion.py   |  56 +--
 .../entrypoints/openai/test_serving_models.py |   3 +-
 tests/prompt_adapter/test_bloom.py            |  48 ---
 .../test_multi_adapter_inference.py           |  56 ---
 tests/prompt_adapter/test_pa_lora.py          |  64 ----
 tools/mypy.sh                                 |   1 -
 vllm/config.py                                |  62 ---
 vllm/core/scheduler.py                        |  12 -
 vllm/engine/arg_utils.py                      |  43 +--
 vllm/engine/async_llm_engine.py               |  10 -
 vllm/engine/llm_engine.py                     |  68 +---
 vllm/engine/multiprocessing/__init__.py       |   4 -
 vllm/engine/multiprocessing/client.py         |   9 +-
 vllm/engine/multiprocessing/engine.py         |  14 +-
 vllm/engine/protocol.py                       |   2 -
 vllm/entrypoints/llm.py                       |  52 +--
 vllm/entrypoints/logger.py                    |   7 +-
 vllm/entrypoints/openai/api_server.py         |   1 -
 vllm/entrypoints/openai/cli_args.py           |  35 +-
 vllm/entrypoints/openai/run_batch.py          |   1 -
 vllm/entrypoints/openai/serving_chat.py       |   9 +-
 .../openai/serving_classification.py          |  10 +-
 vllm/entrypoints/openai/serving_completion.py |   9 +-
 vllm/entrypoints/openai/serving_embedding.py  |   9 +-
 vllm/entrypoints/openai/serving_engine.py     |  32 +-
 vllm/entrypoints/openai/serving_models.py     |  31 --
 vllm/entrypoints/openai/serving_pooling.py    |  12 +-
 vllm/entrypoints/openai/serving_responses.py  |   9 +-
 vllm/entrypoints/openai/serving_score.py      |  26 +-
 .../openai/serving_tokenization.py            |  21 +-
 vllm/entrypoints/openai/speech_to_text.py     |  12 +-
 vllm/executor/executor_base.py                |  31 --
 vllm/inputs/preprocess.py                     |  35 +-
 vllm/prompt_adapter/__init__.py               |   0
 vllm/prompt_adapter/layers.py                 |  83 ----
 vllm/prompt_adapter/models.py                 | 358 ------------------
 vllm/prompt_adapter/request.py                |  37 --
 vllm/prompt_adapter/utils.py                  |  98 -----
 vllm/prompt_adapter/worker_manager.py         | 179 ---------
 vllm/sequence.py                              |  39 +-
 vllm/spec_decode/draft_model_runner.py        |  17 +-
 vllm/utils/__init__.py                        |   5 -
 vllm/v1/engine/async_llm.py                   |   7 +-
 vllm/v1/engine/llm_engine.py                  |   5 +-
 vllm/v1/engine/processor.py                   |   6 -
 vllm/v1/utils.py                              |   2 -
 vllm/v1/worker/gpu_model_runner.py            |   1 -
 vllm/v1/worker/tpu_model_runner.py            |   1 -
 vllm/v1/worker/tpu_worker.py                  |   1 -
 vllm/worker/enc_dec_model_runner.py           |   7 +-
 vllm/worker/hpu_worker.py                     |  18 -
 vllm/worker/model_runner.py                   | 151 +-------
 vllm/worker/model_runner_base.py              |   1 -
 vllm/worker/multi_step_model_runner.py        |   3 -
 vllm/worker/pooling_model_runner.py           |   7 -
 vllm/worker/utils.py                          |   4 -
 vllm/worker/worker.py                         |  14 -
 vllm/worker/worker_base.py                    |   1 -
 60 files changed, 95 insertions(+), 1746 deletions(-)
 delete mode 100644 tests/prompt_adapter/test_bloom.py
 delete mode 100644 tests/prompt_adapter/test_multi_adapter_inference.py
 delete mode 100644 tests/prompt_adapter/test_pa_lora.py
 delete mode 100644 vllm/prompt_adapter/__init__.py
 delete mode 100644 vllm/prompt_adapter/layers.py
 delete mode 100644 vllm/prompt_adapter/models.py
 delete mode 100644 vllm/prompt_adapter/request.py
 delete mode 100644 vllm/prompt_adapter/utils.py
 delete mode 100644 vllm/prompt_adapter/worker_manager.py

diff --git a/docs/api/README.md b/docs/api/README.md
index 5c7b2ca79ee..d5265de4c0a 100644
--- a/docs/api/README.md
+++ b/docs/api/README.md
@@ -15,7 +15,6 @@ API documentation for vLLM's configuration classes.
- [vllm.config.DeviceConfig][] - [vllm.config.SpeculativeConfig][] - [vllm.config.LoRAConfig][] -- [vllm.config.PromptAdapterConfig][] - [vllm.config.MultiModalConfig][] - [vllm.config.PoolerConfig][] - [vllm.config.DecodingConfig][] diff --git a/pyproject.toml b/pyproject.toml index 340abb38565..a85cf243386 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -72,7 +72,6 @@ line-length = 80 "vllm/core/**/*.py" = ["UP006", "UP035"] "vllm/engine/**/*.py" = ["UP006", "UP035"] "vllm/executor/**/*.py" = ["UP006", "UP035"] -"vllm/prompt_adapter/**/*.py" = ["UP006", "UP035"] "vllm/spec_decode/**/*.py" = ["UP006", "UP035"] "vllm/worker/**/*.py" = ["UP006", "UP035"] # Python 3.8 typing - skip utils for ROCm diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py index 7933ca5cd6c..e8b2b3e8f1a 100644 --- a/tests/entrypoints/openai/test_completion.py +++ b/tests/entrypoints/openai/test_completion.py @@ -25,10 +25,6 @@ # technically these adapters use a different base model, # but we're not testing generation quality here LORA_NAME = "typeof/zephyr-7b-beta-lora" -PA_NAME = "swapnilbp/llama_tweet_ptune" -# if PA_NAME changes, PA_NUM_VIRTUAL_TOKENS might also -# need to change to match the prompt adapter -PA_NUM_VIRTUAL_TOKENS = 8 GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"] @@ -55,13 +51,7 @@ def zephyr_lora_added_tokens_files(zephyr_lora_files): @pytest.fixture(scope="module") -def zephyr_pa_files(): - return snapshot_download(repo_id=PA_NAME) - - -@pytest.fixture(scope="module") -def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files, - zephyr_pa_files): +def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files): return [ # use half precision for speed and memory savings in CI environment "--dtype", @@ -80,15 +70,6 @@ def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files, "64", "--max-cpu-loras", "2", - # pa config - "--enable-prompt-adapter", - "--prompt-adapters", - f"zephyr-pa={zephyr_pa_files}", - f"zephyr-pa2={zephyr_pa_files}", - "--max-prompt-adapters", - "2", - "--max-prompt-adapter-token", - "128", ] @@ -109,14 +90,11 @@ async def client(server): @pytest.mark.asyncio @pytest.mark.parametrize( - # first test base model, then test loras, then test prompt adapters - "model_name,num_virtual_tokens", - [(MODEL_NAME, 0), ("zephyr-lora", 0), ("zephyr-lora2", 0), - ("zephyr-pa", PA_NUM_VIRTUAL_TOKENS), - ("zephyr-pa2", PA_NUM_VIRTUAL_TOKENS)], + # first test base model, then test loras + "model_name", + [MODEL_NAME, "zephyr-lora", "zephyr-lora2"], ) -async def test_single_completion(client: openai.AsyncOpenAI, model_name: str, - num_virtual_tokens: int): +async def test_single_completion(client: openai.AsyncOpenAI, model_name: str): completion = await client.completions.create(model=model_name, prompt="Hello, my name is", max_tokens=5, @@ -129,9 +107,7 @@ async def test_single_completion(client: openai.AsyncOpenAI, model_name: str, assert len(choice.text) >= 5 assert choice.finish_reason == "length" assert completion.usage == openai.types.CompletionUsage( - completion_tokens=5, - prompt_tokens=6 + num_virtual_tokens, - total_tokens=11 + num_virtual_tokens) + completion_tokens=5, prompt_tokens=6, total_tokens=11) # test using token IDs completion = await client.completions.create( @@ -174,9 +150,9 @@ async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI): @pytest.mark.asyncio @pytest.mark.parametrize( - # first test base model, then test loras, then test 
prompt adapters + # first test base model, then test loras "model_name", - [MODEL_NAME, "zephyr-lora", "zephyr-lora2", "zephyr-pa", "zephyr-pa2"], + [MODEL_NAME, "zephyr-lora", "zephyr-lora2"], ) async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str): # test using token IDs @@ -193,9 +169,9 @@ async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize( - # just test 1 lora and 1 pa hereafter + # just test 1 lora "model_name", - [MODEL_NAME, "zephyr-lora", "zephyr-pa"], + [MODEL_NAME, "zephyr-lora"], ) async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str): # test using token IDs @@ -216,7 +192,7 @@ async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", - [MODEL_NAME, "zephyr-lora", "zephyr-pa"], + [MODEL_NAME, "zephyr-lora"], ) async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str): # test using token IDs @@ -237,7 +213,7 @@ async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", - [MODEL_NAME, "zephyr-lora", "zephyr-pa"], + [MODEL_NAME, "zephyr-lora"], ) async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI, model_name: str): @@ -313,7 +289,7 @@ async def test_prompt_logprobs_completion(client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", - [MODEL_NAME, "zephyr-lora", "zephyr-pa"], + [MODEL_NAME, "zephyr-lora"], ) async def test_completion_streaming(client: openai.AsyncOpenAI, model_name: str): @@ -347,7 +323,7 @@ async def test_completion_streaming(client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", - [MODEL_NAME, "zephyr-lora", "zephyr-pa"], + [MODEL_NAME, "zephyr-lora"], ) async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str): """Streaming for parallel sampling. 
@@ -381,7 +357,7 @@ async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", - [MODEL_NAME, "zephyr-lora", "zephyr-pa"], + [MODEL_NAME, "zephyr-lora"], ) async def test_completion_stream_options(client: openai.AsyncOpenAI, model_name: str): @@ -518,7 +494,7 @@ async def test_completion_stream_options(client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", - [MODEL_NAME, "zephyr-lora", "zephyr-pa"], + [MODEL_NAME, "zephyr-lora"], ) async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str): # test both text and token IDs diff --git a/tests/entrypoints/openai/test_serving_models.py b/tests/entrypoints/openai/test_serving_models.py index 28af6489a4d..6f0d2d90953 100644 --- a/tests/entrypoints/openai/test_serving_models.py +++ b/tests/entrypoints/openai/test_serving_models.py @@ -32,8 +32,7 @@ async def _async_serving_models_init() -> OpenAIServingModels: serving_models = OpenAIServingModels(engine_client=mock_engine_client, base_model_paths=BASE_MODEL_PATHS, model_config=mock_model_config, - lora_modules=None, - prompt_adapters=None) + lora_modules=None) await serving_models.init_static_loras() return serving_models diff --git a/tests/prompt_adapter/test_bloom.py b/tests/prompt_adapter/test_bloom.py deleted file mode 100644 index 2b603fe8f02..00000000000 --- a/tests/prompt_adapter/test_bloom.py +++ /dev/null @@ -1,48 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest - -import vllm -from vllm.prompt_adapter.request import PromptAdapterRequest - -MODEL_PATH = "bigscience/bloomz-560m" -PA_PATH = 'stevhliu/bloomz-560m_PROMPT_TUNING_CAUSAL_LM' - - -def do_sample(llm, pa_name: str, pa_id: int): - - prompts = [ - "Tweet text : @nationalgridus I have no water and the bill is \ - current and paid. Can you do something about this? Label : ", - "Tweet text : @nationalgridus Looks good thanks! Label : " - ] - sampling_params = vllm.SamplingParams(temperature=0.0, - max_tokens=3, - stop_token_ids=[3]) - - outputs = llm.generate(prompts, - sampling_params, - prompt_adapter_request=PromptAdapterRequest( - pa_name, pa_id, PA_PATH, 8) if pa_id else None) - - # Print the outputs. 
- generated_texts = [] - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text.strip() - generated_texts.append(generated_text) - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - return generated_texts - - -@pytest.mark.parametrize("enforce_eager", [True, False]) -def test_twitter_prompt_adapter(enforce_eager: bool): - llm = vllm.LLM(MODEL_PATH, - enforce_eager=enforce_eager, - enable_prompt_adapter=True, - max_prompt_adapter_token=8) - - expected_output = ['complaint', 'no complaint'] - - assert do_sample(llm, "twitter_pa", pa_id=1) == expected_output diff --git a/tests/prompt_adapter/test_multi_adapter_inference.py b/tests/prompt_adapter/test_multi_adapter_inference.py deleted file mode 100644 index 4f273afb4e3..00000000000 --- a/tests/prompt_adapter/test_multi_adapter_inference.py +++ /dev/null @@ -1,56 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from vllm import EngineArgs, LLMEngine, SamplingParams -from vllm.prompt_adapter.request import PromptAdapterRequest - -MODEL_PATH = "bigscience/bloomz-560m" -pa_path = 'stevhliu/bloomz-560m_PROMPT_TUNING_CAUSAL_LM' -pa_path2 = 'swapnilbp/angry_tweet_ptune' - - -def do_sample(engine): - - prompts = [ - ("Tweet text: I have complaints! Label: ", - SamplingParams(temperature=0.0, max_tokens=3, stop_token_ids=[3]), - PromptAdapterRequest("hate_speech", 1, pa_path2, 8)), - ("Tweet text: I have no problems Label: ", - SamplingParams(temperature=0.0, max_tokens=3, stop_token_ids=[3]), - PromptAdapterRequest("hate_speech2", 2, pa_path2, 8)), - ("Tweet text: I have complaints! Label: ", - SamplingParams(temperature=0.0, max_tokens=3), None), - ("Tweet text: I have no problems Label: ", - SamplingParams(temperature=0.0, max_tokens=3, stop_token_ids=[3]), - PromptAdapterRequest("complain", 3, pa_path, 8)), - ] - - request_id = 0 - results = set() - while prompts or engine.has_unfinished_requests(): - if prompts: - prompt, sampling_params, pa_request = prompts.pop(0) - engine.add_request(str(request_id), - prompt, - sampling_params, - prompt_adapter_request=pa_request) - request_id += 1 - - request_outputs = engine.step() - - for request_output in request_outputs: - if request_output.finished: - results.add(request_output.outputs[0].text) - return results - - -def test_multi_prompt_adapters(): - engine_args = EngineArgs(model=MODEL_PATH, - max_prompt_adapters=3, - enable_prompt_adapter=True, - max_prompt_adapter_token=8) - engine = LLMEngine.from_engine_args(engine_args) - expected_output = { - ' quot;I', 'hate speech', 'no complaint', 'not hate speech' - } - assert do_sample(engine) == expected_output diff --git a/tests/prompt_adapter/test_pa_lora.py b/tests/prompt_adapter/test_pa_lora.py deleted file mode 100644 index ba2e15b81bc..00000000000 --- a/tests/prompt_adapter/test_pa_lora.py +++ /dev/null @@ -1,64 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from huggingface_hub import snapshot_download - -from vllm import EngineArgs, LLMEngine, SamplingParams -from vllm.lora.request import LoRARequest -from vllm.prompt_adapter.request import PromptAdapterRequest - -MODEL_PATH = "meta-llama/Llama-2-7b-hf" -pa_path = snapshot_download(repo_id="swapnilbp/llama_tweet_ptune") -lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test") - - -def do_sample(engine): - - prompt_text = "[user] Write a SQL query to answer the question based on the table 
schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]" # noqa: E501 - - # first prompt with a prompt adapter and second without adapter - prompts = [ - (prompt_text, - SamplingParams(temperature=0.0, max_tokens=100, - stop=["[/assistant]"]), - PromptAdapterRequest("hate_speech", 1, pa_path, - 8), LoRARequest("sql_test", 1, lora_path)), - (prompt_text, - SamplingParams(temperature=0.0, max_tokens=100, - stop=["[/assistant]"]), None, - LoRARequest("sql_test", 1, lora_path)), - ] - - request_id = 0 - results = set() - while prompts or engine.has_unfinished_requests(): - if prompts: - prompt, sampling_params, pa_request, lora_request = prompts.pop(0) - engine.add_request(str(request_id), - prompt, - sampling_params, - prompt_adapter_request=pa_request, - lora_request=lora_request) - request_id += 1 - - request_outputs = engine.step() - - for request_output in request_outputs: - if request_output.finished: - results.add(request_output.outputs[0].text) - return results - - -def test_lora_prompt_adapter(): - engine_args = EngineArgs(model=MODEL_PATH, - enable_prompt_adapter=True, - enable_lora=True, - max_num_seqs=60, - max_prompt_adapter_token=8) - engine = LLMEngine.from_engine_args(engine_args) - result = do_sample(engine) - - expected_output = { - " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' " # noqa: E501 - } - assert result == expected_output diff --git a/tools/mypy.sh b/tools/mypy.sh index 77d342da1ec..595cfe05662 100755 --- a/tools/mypy.sh +++ b/tools/mypy.sh @@ -31,7 +31,6 @@ run_mypy vllm/inputs run_mypy vllm/lora run_mypy vllm/model_executor run_mypy vllm/plugins -run_mypy vllm/prompt_adapter run_mypy vllm/spec_decode run_mypy vllm/worker run_mypy vllm/v1 diff --git a/vllm/config.py b/vllm/config.py index b7ba434db91..91f7e5a91ff 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -3037,59 +3037,6 @@ def verify_lora_support(self): "V1 LoRA does not support long LoRA, please use V0.") -@config -@dataclass(config=ConfigDict(arbitrary_types_allowed=True)) -class PromptAdapterConfig: - """Configuration for PromptAdapters.""" - - max_prompt_adapters: int = 1 - """Max number of PromptAdapters in a batch.""" - max_prompt_adapter_token: int = 0 - """Max number of PromptAdapters tokens.""" - max_cpu_prompt_adapters: Optional[int] = None - """Maximum number of PromptAdapters to store in CPU memory. Must be >= than - `max_prompt_adapters`.""" - prompt_adapter_dtype: Union[torch.dtype, str] = "auto" - """Data type for PromptAdapter. If auto, will default to base model dtype. - """ - - def compute_hash(self) -> str: - """ - WARNING: Whenever a new field is added to this config, - ensure that it is included in the factors list if - it affects the computation graph. - - Provide a hash that uniquely identifies all the configs - that affect the structure of the computation - graph from input ids/embeddings to the final hidden states, - excluding anything before input ids/embeddings and after - the final hidden states. - """ - # no factors to consider. - # this config will not affect the computation graph. 
- factors: list[Any] = [] - hash_str = hashlib.md5(str(factors).encode(), - usedforsecurity=False).hexdigest() - return hash_str - - def __post_init__(self): - - if self.max_prompt_adapters < 1: - raise ValueError(f"max_prompt_adapters " - f"({self.max_prompt_adapters}) must be >= 1.") - if self.max_prompt_adapter_token == 0: - raise ValueError("max_prompt_adapter_token must be set.") - if self.max_cpu_prompt_adapters is None: - self.max_cpu_prompt_adapters = self.max_prompt_adapters - - def verify_with_model_config(self, model_config: ModelConfig): - if self.prompt_adapter_dtype == "auto": - self.prompt_adapter_dtype = model_config.dtype - elif isinstance(self.prompt_adapter_dtype, str): - self.prompt_adapter_dtype = getattr(torch, - self.prompt_adapter_dtype) - - @config @dataclass class MultiModalConfig: @@ -4326,8 +4273,6 @@ class VllmConfig: """Decoding configuration.""" observability_config: Optional[ObservabilityConfig] = None """Observability configuration.""" - prompt_adapter_config: Optional[PromptAdapterConfig] = None - """Prompt adapter configuration.""" quant_config: Optional[QuantizationConfig] = None """Quantization configuration.""" compilation_config: CompilationConfig = field( @@ -4424,10 +4369,6 @@ def compute_hash(self) -> str: vllm_factors.append(self.observability_config.compute_hash()) else: vllm_factors.append("None") - if self.prompt_adapter_config: - vllm_factors.append(self.prompt_adapter_config.compute_hash()) - else: - vllm_factors.append("None") if self.quant_config: pass # should be captured by model_config.quantization if self.compilation_config: @@ -4536,9 +4477,6 @@ def __post_init__(self): self.lora_config.verify_with_cache_config(self.cache_config) self.lora_config.verify_with_model_config(self.model_config) self.lora_config.verify_lora_support() - if self.prompt_adapter_config is not None: - self.prompt_adapter_config.verify_with_model_config( - self.model_config) if self.quant_config is None and self.model_config is not None: self.quant_config = VllmConfig._get_quantization_config( diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 0ef0396996b..61346da145b 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -15,7 +15,6 @@ from vllm.core.interfaces import AllocStatus, BlockSpaceManager from vllm.logger import init_logger from vllm.lora.request import LoRARequest -from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import (Sequence, SequenceData, SequenceGroup, SequenceGroupBase, SequenceGroupMetadata, SequenceGroupMetadataDelta, SequenceStage, @@ -165,8 +164,6 @@ def __post_init__(self): if self.num_loras > 0: self._sort_by_lora_ids() - self.num_prompt_adapters: int = len(self.prompt_adapter_requests) - def is_empty(self) -> bool: # NOTE: We do not consider the ignored sequence groups. 
return (not self.scheduled_seq_groups and not self.blocks_to_swap_in @@ -194,14 +191,6 @@ def lora_requests(self) -> Set[LoRARequest]: if g.seq_group.lora_request is not None } - @property - def prompt_adapter_requests(self) -> Set[PromptAdapterRequest]: - return { - g.seq_group.prompt_adapter_request - for g in self.scheduled_seq_groups - if g.seq_group.prompt_adapter_request is not None - } - @dataclass class SchedulerRunningOutputs: @@ -1648,7 +1637,6 @@ def schedule( multi_modal_placeholders=( seq_group.multi_modal_placeholders if scheduler_outputs.num_prefill_groups > 0 else None), - prompt_adapter_request=seq_group.prompt_adapter_request, ) else: # When SPMD mode is enabled, we only send delta data except for diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index cf94b6a6428..d4de3001af2 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -29,10 +29,10 @@ KVTransferConfig, LoadConfig, LoadFormat, LoRAConfig, ModelConfig, ModelDType, ModelImpl, MultiModalConfig, ObservabilityConfig, ParallelConfig, PoolerConfig, - PrefixCachingHashAlgo, PromptAdapterConfig, - SchedulerConfig, SchedulerPolicy, SpeculativeConfig, - TaskOption, TokenizerMode, TokenizerPoolConfig, - VllmConfig, get_attr_docs, get_field) + PrefixCachingHashAlgo, SchedulerConfig, + SchedulerPolicy, SpeculativeConfig, TaskOption, + TokenizerMode, TokenizerPoolConfig, VllmConfig, + get_attr_docs, get_field) from vllm.executor.executor_base import ExecutorBase from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QuantizationMethods @@ -388,11 +388,6 @@ class EngineArgs: lora_extra_vocab_size: int = LoRAConfig.lora_extra_vocab_size long_lora_scaling_factors: Optional[tuple[float, ...]] = \ LoRAConfig.long_lora_scaling_factors - # PromptAdapter fields - enable_prompt_adapter: bool = False - max_prompt_adapters: int = PromptAdapterConfig.max_prompt_adapters - max_prompt_adapter_token: int = \ - PromptAdapterConfig.max_prompt_adapter_token device: Device = DeviceConfig.device num_scheduler_steps: int = SchedulerConfig.num_scheduler_steps @@ -792,23 +787,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: lora_group.add_argument("--fully-sharded-loras", **lora_kwargs["fully_sharded_loras"]) - # PromptAdapter related configs - prompt_adapter_kwargs = get_kwargs(PromptAdapterConfig) - prompt_adapter_group = parser.add_argument_group( - title="PromptAdapterConfig", - description=PromptAdapterConfig.__doc__, - ) - prompt_adapter_group.add_argument( - "--enable-prompt-adapter", - action=argparse.BooleanOptionalAction, - help="If True, enable handling of PromptAdapters.") - prompt_adapter_group.add_argument( - "--max-prompt-adapters", - **prompt_adapter_kwargs["max_prompt_adapters"]) - prompt_adapter_group.add_argument( - "--max-prompt-adapter-token", - **prompt_adapter_kwargs["max_prompt_adapter_token"]) - # Device arguments device_kwargs = get_kwargs(DeviceConfig) device_group = parser.add_argument_group( @@ -1262,11 +1240,6 @@ def create_engine_config( load_config = self.create_load_config() - prompt_adapter_config = PromptAdapterConfig( - max_prompt_adapters=self.max_prompt_adapters, - max_prompt_adapter_token=self.max_prompt_adapter_token) \ - if self.enable_prompt_adapter else None - decoding_config = DecodingConfig( backend=self.guided_decoding_backend, disable_fallback=self.guided_decoding_disable_fallback, @@ -1294,7 +1267,6 @@ def create_engine_config( load_config=load_config, decoding_config=decoding_config, 
observability_config=observability_config, - prompt_adapter_config=prompt_adapter_config, compilation_config=self.compilation_config, kv_transfer_config=self.kv_transfer_config, kv_events_config=self.kv_events_config, @@ -1381,12 +1353,6 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool: recommend_to_remove=False) return False - # No Prompt Adapter so far. - if self.enable_prompt_adapter: - _raise_or_fallback(feature_name="--enable-prompt-adapter", - recommend_to_remove=False) - return False - # No text embedding inputs so far. if self.enable_prompt_embeds: _raise_or_fallback(feature_name="--enable-prompt-embeds", @@ -1525,7 +1491,6 @@ def _set_default_args_v0(self, model_config: ModelConfig) -> None: if (is_gpu and not use_sliding_window and not use_spec_decode and not self.enable_lora - and not self.enable_prompt_adapter and model_config.runner_type != "pooling"): self.enable_chunked_prefill = True logger.warning( diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 3d7d28055dd..cbff032cdaa 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -29,7 +29,6 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.outputs import PoolingRequestOutput, RequestOutput from vllm.pooling_params import PoolingParams -from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams from vllm.sequence import ExecuteModelRequest from vllm.transformers_utils.tokenizer import AnyTokenizer @@ -435,7 +434,6 @@ async def add_request_async( arrival_time: Optional[float] = None, lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, data_parallel_rank: Optional[int] = None, ) -> None: @@ -467,7 +465,6 @@ async def add_request_async( processed_inputs = await self.input_preprocessor.preprocess_async( prompt, lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request, ) if isinstance(params, SamplingParams) and \ @@ -489,7 +486,6 @@ async def add_request_async( params=params, arrival_time=arrival_time, lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request, trace_headers=trace_headers, priority=priority, ) @@ -859,7 +855,6 @@ async def add_request( arrival_time: Optional[float] = None, lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, data_parallel_rank: Optional[int] = None, ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]: @@ -886,7 +881,6 @@ async def add_request( arrival_time=arrival_time or time.time(), lora_request=lora_request, trace_headers=trace_headers, - prompt_adapter_request=prompt_adapter_request, priority=priority, data_parallel_rank=data_parallel_rank, ) @@ -900,7 +894,6 @@ async def generate( request_id: str, lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, data_parallel_rank: Optional[int] = None, ) -> AsyncGenerator[RequestOutput, None]: @@ -918,8 +911,6 @@ async def generate( request_id: The unique id of the request. lora_request: LoRA request to use for generation, if any. trace_headers: OpenTelemetry trace headers. - prompt_adapter_request: Prompt Adapter request to use - for generation, if any. 
priority: The priority of the request. Only applicable with priority scheduling. data_parallel_rank: The (global) data parallel rank that must @@ -979,7 +970,6 @@ async def generate( sampling_params, lora_request=lora_request, trace_headers=trace_headers, - prompt_adapter_request=prompt_adapter_request, priority=priority, data_parallel_rank=data_parallel_rank, ): diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 25fa1c3058b..df37d173ea8 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -44,7 +44,6 @@ from vllm.outputs import (PoolingRequestOutput, RequestOutput, RequestOutputFactory) from vllm.pooling_params import PoolingParams -from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import RequestOutputKind, SamplingParams from vllm.sequence import (ExecuteModelRequest, ParallelSampleSequenceGroup, PoolingSequenceGroupOutput, Sequence, SequenceGroup, @@ -223,7 +222,6 @@ def __init__( self.load_config = vllm_config.load_config self.decoding_config = vllm_config.decoding_config or DecodingConfig( # noqa ) - self.prompt_adapter_config = vllm_config.prompt_adapter_config # noqa self.observability_config = vllm_config.observability_config or ObservabilityConfig( # noqa ) @@ -294,8 +292,6 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer: # Feature flags "enable_lora": bool(self.lora_config), - "enable_prompt_adapter": - bool(self.prompt_adapter_config), "enable_prefix_caching": self.cache_config.enable_prefix_caching, "enforce_eager": @@ -542,9 +538,6 @@ def _verify_args(self) -> None: self.lora_config.verify_with_model_config(self.model_config) self.lora_config.verify_with_scheduler_config( self.scheduler_config) - if self.prompt_adapter_config: - self.prompt_adapter_config.verify_with_model_config( - self.model_config) def _add_processed_request( self, @@ -553,7 +546,6 @@ def _add_processed_request( params: Union[SamplingParams, PoolingParams], arrival_time: float, lora_request: Optional[LoRARequest], - prompt_adapter_request: Optional[PromptAdapterRequest], trace_headers: Optional[Mapping[str, str]] = None, priority: int = 0, ) -> Optional[SequenceGroup]: @@ -569,7 +561,6 @@ def _add_processed_request( arrival_time=arrival_time, lora_request=lora_request, trace_headers=trace_headers, - prompt_adapter_request=prompt_adapter_request, priority=priority, ) return None @@ -583,11 +574,10 @@ def _add_processed_request( encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs) seq = Sequence(seq_id, decoder_inputs, block_size, eos_token_id, - lora_request, prompt_adapter_request) + lora_request) encoder_seq = (None if encoder_inputs is None else Sequence( - seq_id, encoder_inputs, block_size, eos_token_id, lora_request, - prompt_adapter_request)) + seq_id, encoder_inputs, block_size, eos_token_id, lora_request)) # Create a SequenceGroup based on SamplingParams or PoolingParams if isinstance(params, SamplingParams): @@ -598,7 +588,6 @@ def _add_processed_request( arrival_time=arrival_time, lora_request=lora_request, trace_headers=trace_headers, - prompt_adapter_request=prompt_adapter_request, encoder_seq=encoder_seq, priority=priority) elif isinstance(params, PoolingParams): @@ -608,7 +597,6 @@ def _add_processed_request( params, arrival_time=arrival_time, lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request, encoder_seq=encoder_seq, priority=priority) else: @@ -637,7 +625,6 @@ def add_request( lora_request: Optional[LoRARequest] = None, tokenization_kwargs: 
Optional[dict[str, Any]] = None, trace_headers: Optional[Mapping[str, str]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, ) -> None: """Add a request to the engine's request pool. @@ -658,7 +645,6 @@ def add_request( the current monotonic time. lora_request: The LoRA request to add. trace_headers: OpenTelemetry trace headers. - prompt_adapter_request: The prompt adapter request to add. priority: The priority of the request. Only applicable with priority scheduling. @@ -719,7 +705,6 @@ def add_request( prompt, tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request, ) self._add_processed_request( @@ -728,7 +713,6 @@ def add_request( params=params, arrival_time=arrival_time, lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request, trace_headers=trace_headers, priority=priority, ) @@ -741,7 +725,6 @@ def _create_sequence_group_with_sampling( arrival_time: float, lora_request: Optional[LoRARequest], trace_headers: Optional[Mapping[str, str]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, encoder_seq: Optional[Sequence] = None, priority: int = 0, ) -> SequenceGroup: @@ -769,17 +752,15 @@ def _create_sequence_group_with_sampling( if self.vllm_config.speculative_config is not None: draft_size = \ self.vllm_config.speculative_config.num_speculative_tokens + 1 - seq_group = SequenceGroup( - request_id=request_id, - seqs=[seq], - arrival_time=arrival_time, - sampling_params=sampling_params, - lora_request=lora_request, - trace_headers=trace_headers, - prompt_adapter_request=prompt_adapter_request, - encoder_seq=encoder_seq, - priority=priority, - draft_size=draft_size) + seq_group = SequenceGroup(request_id=request_id, + seqs=[seq], + arrival_time=arrival_time, + sampling_params=sampling_params, + lora_request=lora_request, + trace_headers=trace_headers, + encoder_seq=encoder_seq, + priority=priority, + draft_size=draft_size) return seq_group @@ -790,7 +771,6 @@ def _create_sequence_group_with_pooling( pooling_params: PoolingParams, arrival_time: float, lora_request: Optional[LoRARequest], - prompt_adapter_request: Optional[PromptAdapterRequest], encoder_seq: Optional[Sequence] = None, priority: int = 0, ) -> SequenceGroup: @@ -798,15 +778,13 @@ def _create_sequence_group_with_pooling( # Defensive copy of PoolingParams, which are used by the pooler pooling_params = pooling_params.clone() # Create the sequence group. 
- seq_group = SequenceGroup( - request_id=request_id, - seqs=[seq], - arrival_time=arrival_time, - lora_request=lora_request, - pooling_params=pooling_params, - prompt_adapter_request=prompt_adapter_request, - encoder_seq=encoder_seq, - priority=priority) + seq_group = SequenceGroup(request_id=request_id, + seqs=[seq], + arrival_time=arrival_time, + lora_request=lora_request, + pooling_params=pooling_params, + encoder_seq=encoder_seq, + priority=priority) return seq_group def abort_request(self, request_id: Union[str, Iterable[str]]) -> None: @@ -1842,16 +1820,6 @@ def list_loras(self) -> Set[int]: def pin_lora(self, lora_id: int) -> bool: return self.model_executor.pin_lora(lora_id) - def add_prompt_adapter( - self, prompt_adapter_request: PromptAdapterRequest) -> bool: - return self.model_executor.add_prompt_adapter(prompt_adapter_request) - - def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool: - return self.model_executor.remove_prompt_adapter(prompt_adapter_id) - - def list_prompt_adapters(self) -> List[int]: - return self.model_executor.list_prompt_adapters() - def start_profile(self) -> None: self.model_executor.start_profile() diff --git a/vllm/engine/multiprocessing/__init__.py b/vllm/engine/multiprocessing/__init__.py index db968cd6b5d..ff0405d2f84 100644 --- a/vllm/engine/multiprocessing/__init__.py +++ b/vllm/engine/multiprocessing/__init__.py @@ -10,7 +10,6 @@ from vllm.inputs import PromptType from vllm.lora.request import LoRARequest from vllm.outputs import RequestOutput -from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams from vllm.utils import Device @@ -33,7 +32,6 @@ class RPCProcessRequest: request_id: str lora_request: Optional[LoRARequest] = None trace_headers: Optional[Mapping[str, str]] = None - prompt_adapter_request: Optional[PromptAdapterRequest] = None priority: int = 0 def __init__( @@ -43,7 +41,6 @@ def __init__( request_id: str, lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, ) -> None: super().__init__() @@ -53,7 +50,6 @@ def __init__( self.request_id = request_id self.lora_request = lora_request self.trace_headers = trace_headers - self.prompt_adapter_request = prompt_adapter_request self.priority = priority diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index 9e018ec7f34..67d9a3bf6ce 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -45,7 +45,6 @@ from vllm.lora.request import LoRARequest from vllm.model_executor.layers.sampler import SamplerOutput from vllm.outputs import PoolingRequestOutput, RequestOutput -from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.utils import Device @@ -448,7 +447,6 @@ def generate( request_id: str, lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, ) -> AsyncGenerator[RequestOutput, None]: """Generate outputs for a request. @@ -465,8 +463,6 @@ def generate( request_id: The unique id of the request. lora_request: LoRA request to use for generation, if any. trace_headers: OpenTelemetry trace headers. 
- prompt_adapter_request: Prompt Adapter request to use - for generation, if any. priority: Priority of the request (lower means earlier handling). Any priority other than 0 will lead to an error if the scheduling policy is not "priority". @@ -474,8 +470,7 @@ def generate( return cast( AsyncGenerator[RequestOutput, None], self._process_request(prompt, sampling_params, request_id, - lora_request, trace_headers, - prompt_adapter_request, priority)) + lora_request, trace_headers, priority)) def encode( self, @@ -521,7 +516,6 @@ async def _process_request( request_id: str, lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, ) -> Union[AsyncGenerator[RequestOutput, None], AsyncGenerator[ PoolingRequestOutput, None]]: @@ -575,7 +569,6 @@ async def _process_request( request_id=request_id, lora_request=lora_request, trace_headers=trace_headers, - prompt_adapter_request=prompt_adapter_request, priority=priority, )) diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index ef088bd3933..fe6eb0d8c2f 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -304,14 +304,12 @@ def _handle_process_request(self, request: RPCProcessRequest): self._send_outputs(rpc_err) try: - self.engine.add_request( - request_id=request_id, - prompt=request.prompt, - params=request.params, - lora_request=request.lora_request, - trace_headers=request.trace_headers, - prompt_adapter_request=request.prompt_adapter_request, - priority=request.priority) + self.engine.add_request(request_id=request_id, + prompt=request.prompt, + params=request.params, + lora_request=request.lora_request, + trace_headers=request.trace_headers, + priority=request.priority) if self.log_requests: logger.info("Added request %s.", request.request_id) diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index 8688fcc82cd..85a261ac551 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -16,7 +16,6 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.outputs import CompletionOutput, PoolingRequestOutput, RequestOutput from vllm.pooling_params import PoolingParams -from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import BeamSearchParams, SamplingParams from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils import Device, collect_from_async_generator, random_uuid @@ -55,7 +54,6 @@ def generate( request_id: str, lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, ) -> AsyncGenerator[RequestOutput, None]: """Generate outputs for a request.""" diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 16c051d61de..83c346203ce 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -42,7 +42,6 @@ PoolingRequestOutput, RequestOutput, ScoringRequestOutput) from vllm.pooling_params import PoolingParams -from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams, RequestOutputKind, SamplingParams) from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer, @@ -311,7 +310,6 @@ def generate( *, use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, - 
prompt_adapter_request: Optional[PromptAdapterRequest] = None, guided_options_request: Optional[Union[LLMGuidedOptions, GuidedDecodingRequest]] = None, ) -> list[RequestOutput]: @@ -327,7 +325,6 @@ def generate( prompt_token_ids: Optional[list[int]] = None, use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, guided_options_request: Optional[Union[LLMGuidedOptions, GuidedDecodingRequest]] = None, ) -> list[RequestOutput]: @@ -343,7 +340,6 @@ def generate( prompt_token_ids: Optional[list[list[int]]] = None, use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, guided_options_request: Optional[Union[LLMGuidedOptions, GuidedDecodingRequest]] = None, ) -> list[RequestOutput]: @@ -360,7 +356,6 @@ def generate( prompt_token_ids: list[int], use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, guided_options_request: Optional[Union[LLMGuidedOptions, GuidedDecodingRequest]] = None, ) -> list[RequestOutput]: @@ -377,7 +372,6 @@ def generate( prompt_token_ids: list[list[int]], use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, guided_options_request: Optional[Union[LLMGuidedOptions, GuidedDecodingRequest]] = None, ) -> list[RequestOutput]: @@ -392,7 +386,6 @@ def generate( prompt_token_ids: Union[list[int], list[list[int]]], use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, guided_options_request: Optional[Union[LLMGuidedOptions, GuidedDecodingRequest]] = None, ) -> list[RequestOutput]: @@ -412,7 +405,6 @@ def generate( prompt_token_ids: Optional[Union[list[int], list[list[int]]]] = None, use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, guided_options_request: Optional[Union[LLMGuidedOptions, GuidedDecodingRequest]] = None, priority: Optional[list[int]] = None, @@ -437,8 +429,6 @@ def generate( it is used to create the progress bar. If `False`, no progress bar is created. lora_request: LoRA request to use for generation, if any. - prompt_adapter_request: Prompt Adapter request to use for - generation, if any. priority: The priority of the requests, if any. Only applicable when priority scheduling policy is enabled. @@ -501,7 +491,6 @@ def generate( params=sampling_params, use_tqdm=use_tqdm, lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request, guided_options=guided_options_request, tokenization_kwargs=tokenization_kwargs, priority=priority, @@ -880,7 +869,6 @@ def encode( truncate_prompt_tokens: Optional[int] = None, use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> list[PoolingRequestOutput]: ... 
@@ -895,7 +883,6 @@ def encode( truncate_prompt_tokens: Optional[int] = None, use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> list[PoolingRequestOutput]: ... @@ -910,7 +897,6 @@ def encode( truncate_prompt_tokens: Optional[int] = None, use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> list[PoolingRequestOutput]: ... @@ -926,7 +912,6 @@ def encode( truncate_prompt_tokens: Optional[int] = None, use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> list[PoolingRequestOutput]: ... @@ -942,7 +927,6 @@ def encode( truncate_prompt_tokens: Optional[int] = None, use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> list[PoolingRequestOutput]: ... @@ -956,7 +940,6 @@ def encode( truncate_prompt_tokens: Optional[int] = None, use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> list[PoolingRequestOutput]: ... @@ -975,7 +958,6 @@ def encode( truncate_prompt_tokens: Optional[int] = None, use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> list[PoolingRequestOutput]: """Apply pooling to the hidden states corresponding to the input prompts. @@ -995,8 +977,6 @@ def encode( it is used to create the progress bar. If `False`, no progress bar is created. lora_request: LoRA request to use for generation, if any. - prompt_adapter_request: Prompt Adapter request to use for - generation, if any. Returns: A list of `PoolingRequestOutput` objects containing the @@ -1050,7 +1030,6 @@ def encode( use_tqdm=use_tqdm, lora_request=lora_request, tokenization_kwargs=tokenization_kwargs, - prompt_adapter_request=prompt_adapter_request, ) outputs = self._run_engine(use_tqdm=use_tqdm) @@ -1067,7 +1046,6 @@ def embed( pooling_params: Optional[Union[PoolingParams, Sequence[PoolingParams]]] = None, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> list[EmbeddingRequestOutput]: """ Generate an embedding vector for each prompt. @@ -1087,8 +1065,6 @@ def embed( it is used to create the progress bar. If `False`, no progress bar is created. lora_request: LoRA request to use for generation, if any. - prompt_adapter_request: Prompt Adapter request to use for - generation, if any. 
Returns: A list of `EmbeddingRequestOutput` objects containing the @@ -1102,8 +1078,7 @@ def embed( truncate_prompt_tokens=truncate_prompt_tokens, use_tqdm=use_tqdm, pooling_params=pooling_params, - lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request) + lora_request=lora_request) return [EmbeddingRequestOutput.from_base(item) for item in items] @@ -1114,7 +1089,6 @@ def classify( *, use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> list[ClassificationRequestOutput]: """ Generate class logits for each prompt. @@ -1132,8 +1106,6 @@ def classify( it is used to create the progress bar. If `False`, no progress bar is created. lora_request: LoRA request to use for generation, if any. - prompt_adapter_request: Prompt Adapter request to use for - generation, if any. Returns: A list of `ClassificationRequestOutput` objects containing the @@ -1145,8 +1117,7 @@ def classify( items = self.encode(prompts, use_tqdm=use_tqdm, - lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request) + lora_request=lora_request) return [ClassificationRequestOutput.from_base(item) for item in items] @@ -1158,15 +1129,13 @@ def _embedding_score( truncate_prompt_tokens: Optional[int] = None, use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> list[ScoringRequestOutput]: encoded_output: list[PoolingRequestOutput] = self.encode( text_1 + text_2, truncate_prompt_tokens=truncate_prompt_tokens, use_tqdm=use_tqdm, - lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request) + lora_request=lora_request) encoded_output_1: list[PoolingRequestOutput] = encoded_output[ 0:len(text_1)] @@ -1192,7 +1161,6 @@ def _cross_encoding_score( truncate_prompt_tokens: Optional[int] = None, use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> list[ScoringRequestOutput]: if isinstance(tokenizer, MistralTokenizer): @@ -1230,7 +1198,6 @@ def _cross_encoding_score( params=pooling_params, use_tqdm=use_tqdm, lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request, ) outputs = self._run_engine(use_tqdm=use_tqdm) @@ -1248,7 +1215,6 @@ def score( truncate_prompt_tokens: Optional[int] = None, use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> list[ScoringRequestOutput]: """Generate similarity scores for all pairs ``. @@ -1271,8 +1237,6 @@ def score( it is used to create the progress bar. If `False`, no progress bar is created. lora_request: LoRA request to use for generation, if any. - prompt_adapter_request: Prompt Adapter request to use for - generation, if any. 
Returns: A list of `ScoringRequestOutput` objects containing the @@ -1335,8 +1299,7 @@ def ensure_str(prompt: SingletonPrompt): return self._cross_encoding_score(tokenizer, input_text_1, input_text_2, truncate_prompt_tokens, use_tqdm, - lora_request, - prompt_adapter_request) + lora_request) else: return self._embedding_score( tokenizer, @@ -1344,8 +1307,7 @@ def ensure_str(prompt: SingletonPrompt): input_text_2, # type: ignore[arg-type] truncate_prompt_tokens, use_tqdm, - lora_request, - prompt_adapter_request) + lora_request) def start_profile(self) -> None: self.llm_engine.start_profile() @@ -1456,7 +1418,6 @@ def _validate_and_add_requests( *, use_tqdm: Union[bool, Callable[..., tqdm]] = True, lora_request: Optional[Union[Sequence[LoRARequest], LoRARequest]], - prompt_adapter_request: Optional[PromptAdapterRequest], tokenization_kwargs: Optional[dict[str, Any]] = None, guided_options: Optional[GuidedDecodingRequest] = None, priority: Optional[list[int]] = None, @@ -1502,7 +1463,6 @@ def _validate_and_add_requests( tokenization_kwargs=tokenization_kwargs, lora_request=lora_request[i] if isinstance( lora_request, Sequence) else lora_request, - prompt_adapter_request=prompt_adapter_request, priority=priority[i] if priority else 0, ) @@ -1512,7 +1472,6 @@ def _add_request( params: Union[SamplingParams, PoolingParams], tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, ) -> None: request_id = str(next(self.request_counter)) @@ -1522,7 +1481,6 @@ def _add_request( params, lora_request=lora_request, tokenization_kwargs=tokenization_kwargs, - prompt_adapter_request=prompt_adapter_request, priority=priority, ) diff --git a/vllm/entrypoints/logger.py b/vllm/entrypoints/logger.py index f3aee188dae..06ff3b417f8 100644 --- a/vllm/entrypoints/logger.py +++ b/vllm/entrypoints/logger.py @@ -8,7 +8,6 @@ from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.pooling_params import PoolingParams -from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import BeamSearchParams, SamplingParams logger = init_logger(__name__) @@ -30,7 +29,6 @@ def log_inputs( params: Optional[Union[SamplingParams, PoolingParams, BeamSearchParams]], lora_request: Optional[LoRARequest], - prompt_adapter_request: Optional[PromptAdapterRequest], ) -> None: max_log_len = self.max_log_len if max_log_len is not None: @@ -44,7 +42,6 @@ def log_inputs( "Received request %s: prompt: %r, " "params: %s, prompt_token_ids: %s, " "prompt_embeds shape: %s, " - "lora_request: %s, prompt_adapter_request: %s.", request_id, - prompt, params, prompt_token_ids, + "lora_request: %s.", request_id, prompt, params, prompt_token_ids, prompt_embeds.shape if prompt_embeds is not None else None, - lora_request, prompt_adapter_request) + lora_request) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index e3285a9bf76..426e76a88a1 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1340,7 +1340,6 @@ async def init_app_state( model_config=model_config, base_model_paths=base_model_paths, lora_modules=args.lora_modules, - prompt_adapters=args.prompt_adapters, ) await state.openai_serving_models.init_static_loras() state.openai_serving_responses = OpenAIServingResponses( diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 
4f8aaab772f..4ad6844fda1 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -16,8 +16,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs, optional_type from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption, validate_chat_template) -from vllm.entrypoints.openai.serving_models import (LoRAModulePath, - PromptAdapterPath) +from vllm.entrypoints.openai.serving_models import LoRAModulePath from vllm.entrypoints.openai.tool_parsers import ToolParserManager from vllm.logger import init_logger from vllm.utils import FlexibleArgumentParser @@ -61,27 +60,6 @@ def __call__( setattr(namespace, self.dest, lora_list) -class PromptAdapterParserAction(argparse.Action): - - def __call__( - self, - parser: argparse.ArgumentParser, - namespace: argparse.Namespace, - values: Optional[Union[str, Sequence[str]]], - option_string: Optional[str] = None, - ): - if values is None: - values = [] - if isinstance(values, str): - raise TypeError("Expected values to be a list") - - adapter_list: list[PromptAdapterPath] = [] - for item in values: - name, path = item.split('=') - adapter_list.append(PromptAdapterPath(name, path)) - setattr(namespace, self.dest, adapter_list) - - def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: parser.add_argument("--host", type=optional_type(str), @@ -129,14 +107,6 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: "Example (new format): " "``{\"name\": \"name\", \"path\": \"lora_path\", " "\"base_model_name\": \"id\"}``") - parser.add_argument( - "--prompt-adapters", - type=optional_type(str), - default=None, - nargs='+', - action=PromptAdapterParserAction, - help="Prompt adapter configurations in the format name=path. " - "Multiple adapters can be specified.") parser.add_argument("--chat-template", type=optional_type(str), default=None, @@ -311,9 +281,6 @@ def validate_parsed_serve_args(args: argparse.Namespace): if args.enable_auto_tool_choice and not args.tool_call_parser: raise TypeError("Error: --enable-auto-tool-choice requires " "--tool-call-parser") - if args.enable_prompt_embeds and args.enable_prompt_adapter: - raise ValueError( - "Cannot use prompt embeds and prompt adapter at the same time.") def log_non_default_args(args: argparse.Namespace): diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index e112e2f893a..a9fed18c60e 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -337,7 +337,6 @@ async def main(args): model_config=model_config, base_model_paths=base_model_paths, lora_modules=None, - prompt_adapters=None, ) openai_serving_chat = OpenAIServingChat( engine, diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index a802fbc3865..dc92d4d2f1f 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -150,10 +150,7 @@ async def create_chat_completion( raise self.engine_client.dead_error try: - ( - lora_request, - prompt_adapter_request, - ) = self._maybe_get_adapters(request) + lora_request = self._maybe_get_adapters(request) model_name = self._get_model_name(request.model, lora_request) @@ -255,8 +252,7 @@ async def create_chat_completion( self._log_inputs(request_id, request_prompts[i], params=sampling_params, - lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request) + lora_request=lora_request) trace_headers = (None if raw_request is None else await 
self._get_trace_headers(raw_request.headers)) @@ -275,7 +271,6 @@ async def create_chat_completion( request_id, lora_request=lora_request, trace_headers=trace_headers, - prompt_adapter_request=prompt_adapter_request, priority=request.priority, ) diff --git a/vllm/entrypoints/openai/serving_classification.py b/vllm/entrypoints/openai/serving_classification.py index 3ac4f01ea60..68556864049 100644 --- a/vllm/entrypoints/openai/serving_classification.py +++ b/vllm/entrypoints/openai/serving_classification.py @@ -46,19 +46,11 @@ async def _preprocess( return None try: - ( - ctx.lora_request, - ctx.prompt_adapter_request, - ) = self._maybe_get_adapters(ctx.request) + ctx.lora_request = self._maybe_get_adapters(ctx.request) ctx.tokenizer = await self.engine_client.get_tokenizer( ctx.lora_request) - if ctx.prompt_adapter_request is not None: - raise NotImplementedError( - "Prompt adapter is not supported for classification models" - ) - ( ctx.request_prompts, ctx.engine_prompts, diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 6c9c29b7144..2b6a97f2d87 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -113,10 +113,7 @@ async def create_completion( raw_request.state.request_metadata = request_metadata try: - ( - lora_request, - prompt_adapter_request, - ) = self._maybe_get_adapters(request) + lora_request = self._maybe_get_adapters(request) tokenizer = await self.engine_client.get_tokenizer(lora_request) @@ -184,8 +181,7 @@ async def create_completion( self._log_inputs(request_id_item, request_prompts[i], params=sampling_params, - lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request) + lora_request=lora_request) trace_headers = (None if raw_request is None else await self._get_trace_headers(raw_request.headers)) @@ -208,7 +204,6 @@ async def create_completion( sampling_params, request_id_item, lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request, trace_headers=trace_headers, priority=request.priority, ) diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index e87decfe636..3a216a0fe06 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -51,18 +51,11 @@ async def _preprocess( ) -> Optional[ErrorResponse]: ctx = cast(EmbeddingServeContext, ctx) try: - ( - ctx.lora_request, - ctx.prompt_adapter_request, - ) = self._maybe_get_adapters(ctx.request) + ctx.lora_request = self._maybe_get_adapters(ctx.request) tokenizer = await self.engine_client.get_tokenizer(ctx.lora_request ) - if ctx.prompt_adapter_request is not None: - raise NotImplementedError("Prompt adapter is not supported " - "for embedding models") - if isinstance(ctx.request, EmbeddingChatRequest): ( _, diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index bec2e125479..e35994a4875 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -73,7 +73,6 @@ MultiModalDataDict) from vllm.outputs import PoolingRequestOutput, RequestOutput from vllm.pooling_params import PoolingParams -from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import BeamSearchParams, SamplingParams from vllm.sequence import Logprob, PromptLogprobs from vllm.tracing import (contains_trace_headers, extract_trace_headers, @@ -166,7 +165,6 @@ class 
ServeContext(RequestProcessingMixin, ResponseGenerationMixin, BaseModel, request_id: str created_time: int = Field(default_factory=lambda: int(time.time())) lora_request: Optional[LoRARequest] = None - prompt_adapter_request: Optional[PromptAdapterRequest] = None # Shared across most requests tokenizer: Optional[AnyTokenizer] = None @@ -340,12 +338,10 @@ async def _prepare_generators( return self.create_error_response( "Request prompts not available") - self._log_inputs( - request_id_item, - ctx.request_prompts[i], - params=pooling_params, - lora_request=ctx.lora_request, - prompt_adapter_request=ctx.prompt_adapter_request) + self._log_inputs(request_id_item, + ctx.request_prompts[i], + params=pooling_params, + lora_request=ctx.lora_request) # Mypy has an existing bug related to inferring the variance of # TypedDicts with `builtins.enumerate`: @@ -449,29 +445,19 @@ async def _check_model( if isinstance(load_result, ErrorResponse) and \ load_result.code == HTTPStatus.BAD_REQUEST.value: error_response = load_result - if request.model in [ - prompt_adapter.prompt_adapter_name - for prompt_adapter in self.models.prompt_adapter_requests - ]: - return None return error_response or self.create_error_response( message=f"The model `{request.model}` does not exist.", err_type="NotFoundError", status_code=HTTPStatus.NOT_FOUND) - def _maybe_get_adapters( - self, request: AnyRequest - ) -> Union[tuple[None, None], tuple[LoRARequest, None], tuple[ - None, PromptAdapterRequest]]: + def _maybe_get_adapters(self, + request: AnyRequest) -> Optional[LoRARequest]: if self._is_model_supported(request.model): - return None, None + return None for lora in self.models.lora_requests: if request.model == lora.lora_name: - return lora, None - for prompt_adapter in self.models.prompt_adapter_requests: - if request.model == prompt_adapter.prompt_adapter_name: - return None, prompt_adapter + return lora # if _check_model has been called earlier, this will be unreachable raise ValueError(f"The model `{request.model}` does not exist.") @@ -924,7 +910,6 @@ def _log_inputs( params: Optional[Union[SamplingParams, PoolingParams, BeamSearchParams]], lora_request: Optional[LoRARequest], - prompt_adapter_request: Optional[PromptAdapterRequest], ) -> None: if self.request_logger is None: return @@ -946,7 +931,6 @@ def _log_inputs( prompt_embeds, params=params, lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request, ) async def _get_trace_headers( diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py index 764b0e73690..3eec50ffd0f 100644 --- a/vllm/entrypoints/openai/serving_models.py +++ b/vllm/entrypoints/openai/serving_models.py @@ -1,8 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import json -import pathlib from asyncio import Lock from collections import defaultdict from dataclasses import dataclass @@ -19,7 +17,6 @@ from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry -from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.utils import AtomicCounter logger = init_logger(__name__) @@ -31,12 +28,6 @@ class BaseModelPath: model_path: str -@dataclass -class PromptAdapterPath: - name: str - local_path: str - - @dataclass class LoRAModulePath: name: str @@ -60,7 +51,6 @@ def __init__( base_model_paths: list[BaseModelPath], *, lora_modules: Optional[list[LoRAModulePath]] = None, - 
prompt_adapters: Optional[list[PromptAdapterPath]] = None, ): super().__init__() @@ -80,20 +70,6 @@ def __init__( LoRAResolverRegistry.get_resolver(lora_resolver_name)) self.lora_resolver_lock: dict[str, Lock] = defaultdict(Lock) - self.prompt_adapter_requests = [] - if prompt_adapters is not None: - for i, prompt_adapter in enumerate(prompt_adapters, start=1): - with pathlib.Path(prompt_adapter.local_path, - "adapter_config.json").open() as f: - adapter_config = json.load(f) - num_virtual_tokens = adapter_config["num_virtual_tokens"] - self.prompt_adapter_requests.append( - PromptAdapterRequest( - prompt_adapter_name=prompt_adapter.name, - prompt_adapter_id=i, - prompt_adapter_local_path=prompt_adapter.local_path, - prompt_adapter_num_virtual_tokens=num_virtual_tokens)) - async def init_static_loras(self): """Loads all static LoRA modules. Raises if any fail to load""" @@ -140,14 +116,7 @@ async def show_available_models(self) -> ModelList: permission=[ModelPermission()]) for lora in self.lora_requests ] - prompt_adapter_cards = [ - ModelCard(id=prompt_adapter.prompt_adapter_name, - root=self.base_model_paths[0].name, - permission=[ModelPermission()]) - for prompt_adapter in self.prompt_adapter_requests - ] model_cards.extend(lora_cards) - model_cards.extend(prompt_adapter_cards) return ModelList(data=model_cards) async def load_lora_adapter( diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py index c2ed50d04d1..6c1e43e338d 100644 --- a/vllm/entrypoints/openai/serving_pooling.py +++ b/vllm/entrypoints/openai/serving_pooling.py @@ -94,17 +94,10 @@ async def create_pooling( try: truncate_prompt_tokens = _validate_truncation_size( self.max_model_len, truncate_prompt_tokens) - ( - lora_request, - prompt_adapter_request, - ) = self._maybe_get_adapters(request) + lora_request = self._maybe_get_adapters(request) tokenizer = await self.engine_client.get_tokenizer(lora_request) - if prompt_adapter_request is not None: - raise NotImplementedError("Prompt adapter is not supported " - "for pooling models") - if isinstance(request, PoolingChatRequest): ( _, @@ -148,8 +141,7 @@ async def create_pooling( self._log_inputs(request_id_item, request_prompts[i], params=pooling_params, - lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request) + lora_request=lora_request) trace_headers = (None if raw_request is None else await self._get_trace_headers(raw_request.headers)) diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index ac2b3dfafec..dd4bf160ee0 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -134,10 +134,7 @@ async def create_responses( messages = self._construct_input_messages(request, prev_response) try: - ( - lora_request, - prompt_adapter_request, - ) = self._maybe_get_adapters(request) + lora_request = self._maybe_get_adapters(request) model_name = self._get_model_name(request.model, lora_request) tokenizer = await self.engine_client.get_tokenizer(lora_request) @@ -170,8 +167,7 @@ async def create_responses( self._log_inputs(request.request_id, request_prompts[i], params=sampling_params, - lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request) + lora_request=lora_request) trace_headers = (None if raw_request is None else await self._get_trace_headers(raw_request.headers)) @@ -182,7 +178,6 @@ async def create_responses( request.request_id, lora_request=lora_request, trace_headers=trace_headers, 
- prompt_adapter_request=prompt_adapter_request, priority=request.priority, ) generators.append(generator) diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py index 8b2e3e507c4..9e6a7d294e6 100644 --- a/vllm/entrypoints/openai/serving_score.py +++ b/vllm/entrypoints/openai/serving_score.py @@ -24,7 +24,6 @@ from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.outputs import PoolingRequestOutput, ScoringRequestOutput -from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer from vllm.utils import make_async, merge_async_iterators @@ -52,11 +51,9 @@ async def _embedding_score( texts_1: list[str], texts_2: list[str], request: Union[RerankRequest, ScoreRequest], - request_id=str, + request_id: str, tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[Union[LoRARequest, None]] = None, - prompt_adapter_request: Optional[Union[PromptAdapterRequest, - None]] = None, trace_headers: Optional[Mapping[str, str]] = None, ) -> list[PoolingRequestOutput]: @@ -93,8 +90,7 @@ async def _embedding_score( self._log_inputs(request_id_item, input_texts[i], params=pooling_params, - lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request) + lora_request=lora_request) generators.append( self.engine_client.encode( @@ -143,11 +139,9 @@ async def _cross_encoding_score( texts_1: list[str], texts_2: list[str], request: Union[RerankRequest, ScoreRequest], - request_id=str, + request_id: str, tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[Union[LoRARequest, None]] = None, - prompt_adapter_request: Optional[Union[PromptAdapterRequest, - None]] = None, trace_headers: Optional[Mapping[str, str]] = None, ) -> list[PoolingRequestOutput]: @@ -206,8 +200,7 @@ async def _cross_encoding_score( self._log_inputs(request_id_item, request_prompts[i], params=pooling_params, - lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request) + lora_request=lora_request) generator = self.engine_client.encode( engine_prompt, @@ -241,14 +234,7 @@ async def _run_scoring( truncate_prompt_tokens: Optional[int] = None, ) -> list[PoolingRequestOutput]: - ( - lora_request, - prompt_adapter_request, - ) = self._maybe_get_adapters(request) - - if prompt_adapter_request is not None: - raise NotImplementedError("Prompt adapter is not supported " - "for scoring models") + lora_request = self._maybe_get_adapters(request) tokenizer = await self.engine_client.get_tokenizer(lora_request) @@ -275,7 +261,6 @@ async def _run_scoring( request_id=request_id, tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request, trace_headers=trace_headers) else: @@ -287,7 +272,6 @@ async def _run_scoring( request_id=request_id, tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request, trace_headers=trace_headers) async def create_score( diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index 3db0a71fadd..8faa92b37f2 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -58,10 +58,7 @@ async def create_tokenize( request_id = f"tokn-{self._base_request_id(raw_request)}" try: - ( - lora_request, - prompt_adapter_request, - ) = self._maybe_get_adapters(request) + lora_request = 
self._maybe_get_adapters(request) tokenizer = await self.engine_client.get_tokenizer(lora_request) @@ -102,11 +99,8 @@ async def create_tokenize( self._log_inputs(request_id, request_prompts[i], params=None, - lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request) + lora_request=lora_request) - # Silently ignore prompt adapter since it does not affect - # tokenization (Unlike in Embeddings API where an error is raised) if isinstance(engine_prompt, dict) and "prompt_token_ids" in engine_prompt: input_ids.extend(engine_prompt["prompt_token_ids"]) @@ -131,21 +125,14 @@ async def create_detokenize( request_id = f"tokn-{self._base_request_id(raw_request)}" - ( - lora_request, - prompt_adapter_request, - ) = self._maybe_get_adapters(request) + lora_request = self._maybe_get_adapters(request) tokenizer = await self.engine_client.get_tokenizer(lora_request) self._log_inputs(request_id, request.tokens, params=None, - lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request) - - # Silently ignore prompt adapter since it does not affect tokenization - # (Unlike in Embeddings API where an error is raised) + lora_request=lora_request) prompt_input = await self._tokenize_prompt_input_async( request, diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py index 0ab029e5305..da611362efb 100644 --- a/vllm/entrypoints/openai/speech_to_text.py +++ b/vllm/entrypoints/openai/speech_to_text.py @@ -161,19 +161,12 @@ async def _create_speech_to_text( raw_request.state.request_metadata = request_metadata try: - ( - lora_request, - prompt_adapter_request, - ) = self._maybe_get_adapters(request) + lora_request = self._maybe_get_adapters(request) if lora_request: return self.create_error_response( "Currently do not support LoRA for " f"{self.task_type.title()}.") - if prompt_adapter_request: - return self.create_error_response( - f"Currently do not support PromptAdapter for " - f"{self.task_type.title()}.") prompts, duration_s = await self._preprocess_speech_to_text( request=request, @@ -198,8 +191,7 @@ async def _create_speech_to_text( request_id, prompts[0]['decoder_prompt'], # type: ignore params=sampling_params, - lora_request=None, - prompt_adapter_request=None) + lora_request=None) list_result_generator = [ self.engine_client.generate( diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index 99e12201c96..bebe7afe5d8 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -15,7 +15,6 @@ from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import ExecuteModelRequest, PoolerOutput from vllm.utils import make_async from vllm.worker.worker_base import WorkerBase @@ -48,7 +47,6 @@ def __init__( self.scheduler_config = vllm_config.scheduler_config self.device_config = vllm_config.device_config self.speculative_config = vllm_config.speculative_config - self.prompt_adapter_config = vllm_config.prompt_adapter_config self.observability_config = vllm_config.observability_config self._init_executor() self.is_sleeping = False @@ -164,35 +162,6 @@ def list_loras(self) -> Set[int]: assert s == sets[0], "All workers should have the same LORAs." 
return sets[0] - def add_prompt_adapter( - self, prompt_adapter_request: PromptAdapterRequest) -> bool: - assert prompt_adapter_request.prompt_adapter_id > 0, \ - "prompt_adapter_id must be greater than 0." - return all( - self.collective_rpc("add_prompt_adapter", - args=(prompt_adapter_request, ))) - - def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool: - assert prompt_adapter_id > 0, \ - "prompt_adapter_id must be greater than 0." - return all( - self.collective_rpc("remove_prompt_adapter", - args=(prompt_adapter_id, ))) - - def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: - assert prompt_adapter_id > 0, \ - "prompt_adapter_id must be greater than 0." - return all( - self.collective_rpc("pin_prompt_adapter", - args=(prompt_adapter_id, ))) - - def list_prompt_adapters(self) -> Set[int]: - sets = self.collective_rpc("list_prompt_adapters") - for s in sets: - assert (s == sets[0] - ), "All workers should have the same prompt adapters." - return sets[0] - def start_profile(self) -> None: self.collective_rpc("start_profile") diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index deda9bc23da..de5dc087665 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -13,7 +13,6 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalEncDecInputs, MultiModalInputs) -from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import TokenizerGroup @@ -168,18 +167,6 @@ def _prepare_decoder_input_ids_for_generation( return decoder_input_ids - def _apply_prompt_adapter( - self, - prompt_token_ids: list[int], - prompt_adapter_request: Optional[PromptAdapterRequest], - ) -> list[int]: - if prompt_adapter_request: - prompt_token_ids = ( - [0] * prompt_adapter_request.prompt_adapter_num_virtual_tokens - + prompt_token_ids) - - return prompt_token_ids - def _get_tokenization_kw( self, overrides: Optional[dict[str, Any]] = None, @@ -786,15 +773,10 @@ async def _process_encoder_decoder_prompt_async( def _build_decoder_only_llm_inputs( self, prompt_inputs: DecoderOnlyInputs, - prompt_adapter_request: Optional[PromptAdapterRequest], ) -> DecoderOnlyInputs: if "prompt_token_ids" in prompt_inputs: prompt_inputs = cast(Union[TokenInputs, MultiModalInputs], prompt_inputs) # Needed for mypy - prompt_inputs["prompt_token_ids"] = self._apply_prompt_adapter( - prompt_inputs["prompt_token_ids"], - prompt_adapter_request=prompt_adapter_request, - ) return prompt_inputs @@ -803,7 +785,6 @@ def _process_decoder_only_prompt( prompt: SingletonPrompt, tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, return_mm_hashes: bool = False, ) -> DecoderOnlyInputs: """ @@ -815,7 +796,6 @@ def _process_decoder_only_prompt( * prompt: input prompt * lora_request - * prompt_adapter_request * return_mm_hashes Returns: @@ -830,17 +810,13 @@ def _process_decoder_only_prompt( return_mm_hashes=return_mm_hashes, ) - return self._build_decoder_only_llm_inputs( - prompt_comps, - prompt_adapter_request=prompt_adapter_request, - ) + return self._build_decoder_only_llm_inputs(prompt_comps) async def _process_decoder_only_prompt_async( self, prompt: SingletonPrompt, tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, - prompt_adapter_request: 
Optional[PromptAdapterRequest] = None, return_mm_hashes: bool = False, ) -> DecoderOnlyInputs: """ @@ -854,17 +830,13 @@ async def _process_decoder_only_prompt_async( return_mm_hashes=return_mm_hashes, ) - return self._build_decoder_only_llm_inputs( - prompt_comps, - prompt_adapter_request=prompt_adapter_request, - ) + return self._build_decoder_only_llm_inputs(prompt_comps) def preprocess( self, prompt: PromptType, tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, return_mm_hashes: bool = False, ) -> ProcessorInputs: """Preprocess the input prompt.""" @@ -886,7 +858,6 @@ def preprocess( prompt, tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request, return_mm_hashes=return_mm_hashes, ) @@ -895,7 +866,6 @@ async def preprocess_async( prompt: PromptType, tokenization_kwargs: Optional[dict[str, Any]] = None, lora_request: Optional[LoRARequest] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, return_mm_hashes: bool = False, ) -> ProcessorInputs: """ @@ -919,6 +889,5 @@ async def preprocess_async( prompt, tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request, return_mm_hashes=return_mm_hashes, ) diff --git a/vllm/prompt_adapter/__init__.py b/vllm/prompt_adapter/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/vllm/prompt_adapter/layers.py b/vllm/prompt_adapter/layers.py deleted file mode 100644 index b5b925d042f..00000000000 --- a/vllm/prompt_adapter/layers.py +++ /dev/null @@ -1,83 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from dataclasses import dataclass -from typing import Optional - -import torch -from torch import nn - -from vllm.adapter_commons.layers import AdapterMapping -from vllm.config import PromptAdapterConfig -from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding) - - -@dataclass -class PromptAdapterMapping(AdapterMapping): - pass - - -class VocabParallelEmbeddingWithPromptAdapter(nn.Module): - - def __init__(self, base_layer: VocabParallelEmbedding) -> None: - super().__init__() - self.base_layer = base_layer - self.emb_layer = self.base_layer - if 'LoRA' in base_layer.__class__.__name__: - self.emb_layer = self.base_layer.base_layer - - def create_prompt_adapter_weights( - self, prompt_adapter_config: PromptAdapterConfig): - self.embeddings_tensors = torch.zeros( - ( - prompt_adapter_config.max_prompt_adapters, - prompt_adapter_config.max_prompt_adapter_token, - self.emb_layer.embedding_dim, - ), - dtype=self.emb_layer.weight.dtype, - device=self.emb_layer.weight.device, - ) - self.adapter_lengths = torch.zeros( - prompt_adapter_config.max_prompt_adapters, - dtype=torch.long, - device=self.emb_layer.weight.device) - - self.indices_gpu: torch.Tensor - self.embedding_indices_gpu: torch.Tensor - - def reset_prompt_adapter(self, index: int): - self.embeddings_tensors[index] = 0 - - def set_prompt_adapter( - self, - index: int, - adapter_model: Optional[torch.Tensor], - ): - self.reset_prompt_adapter(index) - if adapter_model is not None: - length = adapter_model.shape[0] - self.embeddings_tensors[index, :length] = adapter_model - self.adapter_lengths[index] = length - - def set_mapping( - self, - prompt_indices: torch.Tensor, - prompt_embedding_indices: torch.Tensor, - ): - 
self.indices_gpu = prompt_indices.to( - device=self.emb_layer.weight.device) - self.embedding_indices_gpu = prompt_embedding_indices.to( - device=self.emb_layer.weight.device) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - hidden_states = self.base_layer(x) - if self.embedding_indices_gpu.ndim > 1: - valid_mask = self.indices_gpu != -1 - gathered_embeddings = self.embeddings_tensors[ - self.embedding_indices_gpu[:, 0], - self.embedding_indices_gpu[:, 1]] - - # Update hidden states - hidden_states[valid_mask] = gathered_embeddings - return hidden_states \ No newline at end of file diff --git a/vllm/prompt_adapter/models.py b/vllm/prompt_adapter/models.py deleted file mode 100644 index 864b50c861e..00000000000 --- a/vllm/prompt_adapter/models.py +++ /dev/null @@ -1,358 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import logging -import math -from typing import Any, Callable, Dict, List, Optional, Type - -import torch -from torch import nn - -from vllm.adapter_commons.models import (AdapterLRUCache, AdapterModel, - AdapterModelManager) -from vllm.adapter_commons.utils import (add_adapter, deactivate_adapter, - get_adapter, list_adapters, - remove_adapter, set_adapter_mapping) -from vllm.config import PromptAdapterConfig -from vllm.prompt_adapter.layers import ( - VocabParallelEmbeddingWithPromptAdapter) # yapf: disable -from vllm.prompt_adapter.layers import PromptAdapterMapping -from vllm.prompt_adapter.utils import load_peft_weights - -logger = logging.getLogger(__name__) - -_GLOBAL_PROMPT_ADAPTER_ID = 0 - - -def get_prompt_adapter_id(): - global _GLOBAL_PROMPT_ADAPTER_ID - _GLOBAL_PROMPT_ADAPTER_ID += 1 - return _GLOBAL_PROMPT_ADAPTER_ID - - -def convert_to_embedding_indices(indices): - embedding_indices = [] - count = 0 - - for value in indices: - if value == -1: - count = 0 - else: - embedding_indices.append([value, count]) - count += 1 - - return torch.tensor(embedding_indices) - - -def convert_mapping( - mapping: PromptAdapterMapping, - prompt_adapter_index_to_id: List[Optional[int]], -) -> torch.Tensor: - """Converts PromptAdapterMapping to index tensors. - - Args: - mapping: PromptAdapterMapping mapping rows in a - batch to PromptAdapter ids. - prompt_adapter_index_to_id: List mapping PromptAdapter - ids to PromptAdapter indices. - - Returns: - pa_indices: Tensor of shape [batch_size] mapping batch rows to - PromptAdapter indices. 
- """ - id_to_index = { - id_: idx - for idx, id_ in enumerate(prompt_adapter_index_to_id) - if id_ is not None - } - pa_indices = ([ - id_to_index.get(id_, -1) if id_ > 0 else -1 - for id_ in mapping.index_mapping - ]) - - pa_embedding_mapping = convert_to_embedding_indices(pa_indices) - pa_indices = torch.tensor(pa_indices) - return pa_indices, pa_embedding_mapping - - -class PromptAdapterModel(AdapterModel): - - def __init__(self, - prompt_adapter_id=None, - num_virtual_tokens=None, - prompt_embedding=None) -> None: - self.id = prompt_adapter_id - self.prompt_embedding = prompt_embedding - self.num_virtual_tokens = num_virtual_tokens - - @classmethod - def from_local_checkpoint( - cls, - adapter_model_path: str, - prompt_adapter_id: int, - num_virtual_tokens: int, - config: PromptAdapterConfig, - device: str = "cuda", - ) -> "PromptAdapterModel": - - if num_virtual_tokens > config.max_prompt_adapter_token: - raise ValueError( - f'num_virtual_tokens ({num_virtual_tokens}) should be <= ' - f'max_prompt_adapter_token({config.max_prompt_adapter_token})') - - adapters_weights = load_peft_weights(adapter_model_path, device) - prompt_embedding = adapters_weights["prompt_embeddings"].to( - config.prompt_adapter_dtype) - - return cls(prompt_adapter_id, num_virtual_tokens, prompt_embedding) - - -class PromptAdapterModelManager(AdapterModelManager): - """A manager that manages multiple Prompt Adapter models.""" - - def __init__( - self, - model: nn.Module, - max_num_seqs: int, - max_num_batched_tokens: int, - prompt_adapter_config: PromptAdapterConfig, - ): - """Create a PromptAdapterModel and adapter for a given model. - - Args: - model: the model to be adapted. - max_num_seqs: the maximum number of sequences model can run in a - single batch. - max_num_batched_tokens: the maximum number of tokens model can run - in a single batch. - prompt_adapter_config: the PromptAdapter config, - """ - self.model: nn.Module = model - # Dict instead of a Set for compatibility with LRUCache. - self.prompt_adapter_index_to_id: List[ - Optional[int]] = [None] * self.prompt_adapter_slots - self.max_num_seqs = max_num_seqs - self.max_num_batched_tokens = math.ceil(max_num_batched_tokens / 8) * 8 - self.prompt_adapter_config = prompt_adapter_config - self.model.prompt_adapter_manager = self - self.adapter_type = 'PromptAdapter' - - self.base_indices = torch.tensor([-1]) - self.base_embedding_indices = torch.tensor([]) - - self.modules: Dict[str, nn.Module] = {} - self._create_prompt_adapter_modules() - self._last_mapping: Optional[PromptAdapterMapping] = None - - @property - def prompt_adapter_slots(self) -> int: - return self.prompt_adapter_config.max_prompt_adapters - - @property - def adapter_slots(self) -> int: - return self.prompt_adapter_slots - - @property - def capacity(self) -> int: - return self.prompt_adapter_config.max_cpu_prompt_adapters - - def activate_adapter( - self, - prompt_adapter_id: int, - ) -> bool: - """Move PromptAdapter into a GPU buffer - to be used in the forward pass.""" - if prompt_adapter_id in self._active_adapters: - return False - first_free_slot = next( - ((i, prompt_adapter_id) for i, prompt_adapter_id in enumerate( - self.prompt_adapter_index_to_id) if prompt_adapter_id is None), - None) - if first_free_slot is None: - raise ValueError("No free prompt_adapter slots") - index, _ = first_free_slot - self._active_adapters[prompt_adapter_id] = None - prompt_adapter_model = (self._registered_adapters[prompt_adapter_id]) - logger.debug("Activating prompt_adapter. 
int id: %d, slot index: %d", - prompt_adapter_model.id, index) - self.prompt_adapter_index_to_id[index] = prompt_adapter_model.id - for _, v in self.modules.items(): - v.set_prompt_adapter(index, prompt_adapter_model.prompt_embedding) - return True - - def _deactivate_adapter(self, prompt_adapter_id: int): - try: - index = self.prompt_adapter_index_to_id.index(prompt_adapter_id) - self.prompt_adapter_index_to_id[index] = None - for _, v in self.modules.items(): - v.reset_prompt_adapter(index) - except ValueError: - pass - - def _add_adapter(self, prompt_adapter: PromptAdapterModel): - self._registered_adapters[prompt_adapter.id] = prompt_adapter - - def _set_adapter_mapping(self, mapping: PromptAdapterMapping) -> None: - base_indices, base_embedding_indices = convert_mapping( - mapping, self.prompt_adapter_index_to_id) - for k, v in self.modules.items(): - v.set_mapping(base_indices, base_embedding_indices) - - def _create_prompt_adapter_modules(self): - for module_name, module in self.model.named_modules( - remove_duplicate=False): - if "VocabParallel" in module.__class__.__name__: - new_module = VocabParallelEmbeddingWithPromptAdapter(module) - new_module.create_prompt_adapter_weights( - self.prompt_adapter_config) - replaced_module = self.replace_submodule( - self.model, module_name, new_module) - self.register_module(module.__class__.__name__, - replaced_module) - replaced_module.set_mapping(self.base_indices, - self.base_embedding_indices) - break - - def replace_submodule(self, model: nn.Module, module_name: str, - new_module: nn.Module) -> nn.Module: - """Replace a submodule in a model with a new module.""" - parent = model.get_submodule(".".join(module_name.split(".")[:-1])) - target_name = module_name.split(".")[-1] - setattr(parent, target_name, new_module) - return new_module - - def register_module(self, module_name: str, module: nn.Module): - self.modules[module_name] = module - - def pin_adapter(self, prompt_adapter_id: int) -> bool: - """Pin a PromptAdapterModel in the manager cache.""" - raise NotImplementedError( - "Pinning is not supported in PromptAdapterModelManager. 
" - "Use LRUCachePromptAdapterModelManager for pinning" - ) # type: ignore - - def remove_all_adapters(self): - """Remove all PromptAdapterModel from the manager.""" - self._registered_adapters.clear() - self.prompt_adapter_index_to_id = [None] * self.prompt_adapter_slots - self._active_adapters.clear() - - def deactivate_adapter(self, adapter_id: int) -> bool: - return deactivate_adapter(adapter_id, self._active_adapters, - self._deactivate_adapter) - - def add_adapter(self, adapter: PromptAdapterModel) -> bool: - return add_adapter(adapter, self._registered_adapters, self.capacity, - self._add_adapter) - - def set_adapter_mapping(self, mapping: PromptAdapterMapping) -> None: - self._last_mapping = set_adapter_mapping(mapping, self._last_mapping, - self._set_adapter_mapping) - - def remove_adapter(self, adapter_id: int) -> bool: - return remove_adapter(adapter_id, self._registered_adapters, - self.deactivate_adapter) - - def list_adapters(self) -> Dict[int, Any]: - return list_adapters(self._registered_adapters) - - def get_adapter(self, adapter_id: int) -> Optional[Any]: - return get_adapter(adapter_id, self._registered_adapters) - - -class PromptAdapterLRUCache(AdapterLRUCache[PromptAdapterModel]): - - def __init__(self, capacity: int, - deactivate_prompt_adapter_fn: Callable[[int], bool]): - super().__init__(capacity, deactivate_prompt_adapter_fn) - - -class LRUCachePromptAdapterModelManager(PromptAdapterModelManager): - """A model manager that manages multiple prompt_adapters with LRU cache.""" - - def __init__( - self, - model: nn.Module, - max_num_seqs: int, - max_num_batched_tokens: int, - prompt_adapter_config: PromptAdapterConfig, - ): - self.prompt_adapter_config = prompt_adapter_config - super().__init__(model, max_num_seqs, max_num_batched_tokens, - prompt_adapter_config) - self._registered_adapters = PromptAdapterLRUCache( - self.capacity, self.deactivate_adapter) - self._active_adapters = PromptAdapterLRUCache( - self.prompt_adapter_slots, self._deactivate_adapter) - - def list_adapters(self) -> Dict[int, PromptAdapterModel]: - """List all registered PromptAdapterModel.""" - return dict(self._registered_adapters.cache) - - def add_adapter(self, prompt_adapter: PromptAdapterModel) -> bool: - """Add a PromptAdapterModel to the manager.""" - if prompt_adapter.id not in self._registered_adapters: - self._add_adapter(prompt_adapter) - was_added = True - else: - # We always touch to update the LRU cache order - self._registered_adapters.touch(prompt_adapter.id) - was_added = False - return was_added - - def activate_adapter( - self, - prompt_adapter_id: int, - ) -> bool: - if prompt_adapter_id not in self._active_adapters and len( - self._active_adapters) >= self.prompt_adapter_slots: - self._active_adapters.remove_oldest() - result = super().activate_adapter(prompt_adapter_id) - # We always touch to update the LRU cache order - self._active_adapters.touch(prompt_adapter_id) - return result - - def remove_oldest_adapter(self) -> bool: - if len(self._registered_adapters) > 0: - self._registered_adapters.remove_oldest() - return True - return False - - def pin_adapter(self, prompt_adapter_id: int) -> bool: - """Pin a PromptAdapterModel in the manager cache.""" - self._pin_prompt_adapter_in_cpu_cache(prompt_adapter_id) - self._pin_prompt_adapter_in_gpu_cache(prompt_adapter_id) - return True - - def _pin_prompt_adapter_in_cpu_cache(self, prompt_adapter_id: int): - try: - self._registered_adapters.pin(prompt_adapter_id) - except ValueError as err: - raise ValueError( - "Pinning failed. 
" - f"Prompt Adapter {prompt_adapter_id} is not registered." - ) from err - - def _pin_prompt_adapter_in_gpu_cache(self, prompt_adapter_id: int): - if prompt_adapter_id not in self._active_adapters: - # move adapter to gpu if not already active - self.activate_adapter(prompt_adapter_id) - self._active_adapters.pin(prompt_adapter_id) - - -def create_prompt_adapter_manager( - model: nn.Module, - max_num_seqs: int, - max_num_batched_tokens: int, - prompt_adapter_config: PromptAdapterConfig, - prompt_adapter_manager_cls: Type[ - PromptAdapterModelManager] = PromptAdapterModelManager, - **kwargs) -> PromptAdapterModelManager: - """Create a PromptAdapterModel for a given model.""" - prompt_adapter_manager = prompt_adapter_manager_cls( - model=model, - max_num_seqs=max_num_seqs, - max_num_batched_tokens=max_num_batched_tokens, - prompt_adapter_config=prompt_adapter_config, - **kwargs) - return prompt_adapter_manager diff --git a/vllm/prompt_adapter/request.py b/vllm/prompt_adapter/request.py deleted file mode 100644 index 3ce50d0a26b..00000000000 --- a/vllm/prompt_adapter/request.py +++ /dev/null @@ -1,37 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import msgspec - -from vllm.adapter_commons.request import AdapterRequest - - -class PromptAdapterRequest( - msgspec.Struct, - array_like=True, # type: ignore[call-arg] - omit_defaults=True, # type: ignore[call-arg] - frozen=True): # type: ignore[call-arg] - """ - Request for a Prompt adapter. - """ - __metaclass__ = AdapterRequest - - prompt_adapter_name: str - prompt_adapter_id: int - prompt_adapter_local_path: str - prompt_adapter_num_virtual_tokens: int - - def __hash__(self): - return super().__hash__() - - @property - def adapter_id(self): - return self.prompt_adapter_id - - @property - def name(self): - return self.prompt_adapter_name - - @property - def local_path(self): - return self.prompt_adapter_local_path diff --git a/vllm/prompt_adapter/utils.py b/vllm/prompt_adapter/utils.py deleted file mode 100644 index ddd007868f6..00000000000 --- a/vllm/prompt_adapter/utils.py +++ /dev/null @@ -1,98 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# code borrowed from: https://github.com/huggingface/peft/blob/v0.12.0/src/peft/utils/save_and_load.py#L420 - -import os -from typing import Optional - -import torch -from huggingface_hub import file_exists, hf_hub_download -from huggingface_hub.utils import EntryNotFoundError -from safetensors.torch import load_file as safe_load_file - -from vllm.platforms import current_platform - -WEIGHTS_NAME = "adapter_model.bin" -SAFETENSORS_WEIGHTS_NAME = "adapter_model.safetensors" - - -# Get current device name based on available devices -def infer_device() -> str: - if current_platform.is_cuda_alike(): - return "cuda" - return "cpu" - - -def load_peft_weights(model_id: str, - device: Optional[str] = None, - **hf_hub_download_kwargs) -> dict: - r""" - A helper method to load the PEFT weights from the HuggingFace Hub or locally - - Args: - model_id (`str`): - The local path to the adapter weights or the name of the adapter to - load from the HuggingFace Hub. - device (`str`): - The device to load the weights onto. - hf_hub_download_kwargs (`dict`): - Additional arguments to pass to the `hf_hub_download` method when - loading from the HuggingFace Hub. 
- """ - path = (os.path.join(model_id, hf_hub_download_kwargs["subfolder"]) if - hf_hub_download_kwargs.get("subfolder") is not None else model_id) - - if device is None: - device = infer_device() - - if os.path.exists(os.path.join(path, SAFETENSORS_WEIGHTS_NAME)): - filename = os.path.join(path, SAFETENSORS_WEIGHTS_NAME) - use_safetensors = True - elif os.path.exists(os.path.join(path, WEIGHTS_NAME)): - filename = os.path.join(path, WEIGHTS_NAME) - use_safetensors = False - else: - token = hf_hub_download_kwargs.get("token") - if token is None: - token = hf_hub_download_kwargs.get("use_auth_token") - - hub_filename = (os.path.join(hf_hub_download_kwargs["subfolder"], - SAFETENSORS_WEIGHTS_NAME) - if hf_hub_download_kwargs.get("subfolder") is not None - else SAFETENSORS_WEIGHTS_NAME) - has_remote_safetensors_file = file_exists( - repo_id=model_id, - filename=hub_filename, - revision=hf_hub_download_kwargs.get("revision"), - repo_type=hf_hub_download_kwargs.get("repo_type"), - token=token, - ) - use_safetensors = has_remote_safetensors_file - - if has_remote_safetensors_file: - # Priority 1: load safetensors weights - filename = hf_hub_download( - model_id, - SAFETENSORS_WEIGHTS_NAME, - **hf_hub_download_kwargs, - ) - else: - try: - filename = hf_hub_download(model_id, WEIGHTS_NAME, - **hf_hub_download_kwargs) - except EntryNotFoundError: - raise ValueError( # noqa: B904 - f"Can't find weights for {model_id} in {model_id} or \ - in the Hugging Face Hub. " - f"Please check that the file {WEIGHTS_NAME} or \ - {SAFETENSORS_WEIGHTS_NAME} is present at {model_id}.") - - if use_safetensors: - adapters_weights = safe_load_file(filename, device=device) - else: - adapters_weights = torch.load(filename, - map_location=torch.device(device), - weights_only=True) - - return adapters_weights diff --git a/vllm/prompt_adapter/worker_manager.py b/vllm/prompt_adapter/worker_manager.py deleted file mode 100644 index 56265de8087..00000000000 --- a/vllm/prompt_adapter/worker_manager.py +++ /dev/null @@ -1,179 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import logging -from typing import Any, Optional, Set, Type - -import torch - -from vllm.adapter_commons.utils import (add_adapter_worker, - apply_adapters_worker, - list_adapters_worker, - set_active_adapters_worker) -from vllm.adapter_commons.worker_manager import AbstractWorkerManager -from vllm.config import PromptAdapterConfig -from vllm.prompt_adapter.models import (LRUCachePromptAdapterModelManager, - PromptAdapterModel, - PromptAdapterModelManager, - create_prompt_adapter_manager) -from vllm.prompt_adapter.request import PromptAdapterRequest - -logger = logging.getLogger(__name__) - - -class WorkerPromptAdapterManager(AbstractWorkerManager): - """WorkerPromptAdapterManager that manages - prompt_adapter models on the worker side. 
- - Every request, the requested prompt_adapters will be - loaded (unless they are already loaded), - and every other prompt_adapter will be unloaded.""" - - _manager_cls: Type[PromptAdapterModelManager] = PromptAdapterModelManager - - def __init__( - self, - max_num_seqs: int, - max_num_batched_tokens: int, - device: torch.device, - prompt_adapter_config: PromptAdapterConfig, - prompt_adapter_model_cls: Type[PromptAdapterModel] = PromptAdapterModel - ): - self._adapter_manager: PromptAdapterModelManager - self.max_num_seqs = max_num_seqs - self.max_num_batched_tokens = max_num_batched_tokens - self._prompt_adapter_model_cls = prompt_adapter_model_cls - self.prompt_adapter_config = prompt_adapter_config - super().__init__(device) - - @property - def is_enabled(self) -> bool: - return True - - def create_prompt_adapter_manager( - self, - model: torch.nn.Module, - ) -> Any: - prompt_adapter_manager = create_prompt_adapter_manager( - model, - max_num_seqs=self.max_num_seqs, - max_num_batched_tokens=self.max_num_batched_tokens, - prompt_adapter_config=self.prompt_adapter_config, - prompt_adapter_manager_cls=self._manager_cls, - ) - self._adapter_manager = prompt_adapter_manager - return prompt_adapter_manager.model - - def _load_adapter( - self, prompt_adapter_request: PromptAdapterRequest - ) -> PromptAdapterModel: - try: - prompt_adapter = ( - self._prompt_adapter_model_cls.from_local_checkpoint( - prompt_adapter_request.prompt_adapter_local_path, - prompt_adapter_id=prompt_adapter_request.prompt_adapter_id, - num_virtual_tokens=prompt_adapter_request. - prompt_adapter_num_virtual_tokens, - config=self.prompt_adapter_config, - device=str(self.device), - )) - except Exception as e: - raise RuntimeError( - f"Loading prompt_adapter " - f"{prompt_adapter_request.prompt_adapter_local_path}" - f" failed") from e - return prompt_adapter - - def add_dummy_prompt_adapter( - self, prompt_adapter_request: PromptAdapterRequest) -> bool: - return True - - def pin_adapter(self, adapter_id: int) -> bool: - return self._adapter_manager.pin_adapter(adapter_id) - - def set_active_adapters(self, requests: Set[Any], - mapping: Optional[Any]) -> None: - set_active_adapters_worker(requests, mapping, self._apply_adapters, - self._adapter_manager.set_adapter_mapping) - - def add_adapter(self, adapter_request: Any) -> bool: - return add_adapter_worker(adapter_request, self.list_adapters, - self._load_adapter, - self._adapter_manager.add_adapter, - self._adapter_manager.activate_adapter) - - def _apply_adapters(self, adapter_requests: Set[Any]) -> None: - apply_adapters_worker(adapter_requests, self.list_adapters, - self._adapter_manager.adapter_slots, - self.remove_adapter, self.add_adapter) - - def remove_adapter(self, adapter_id: int) -> bool: - return self._adapter_manager.remove_adapter(adapter_id) - - def remove_all_adapters(self): - self._adapter_manager.remove_all_adapters() - - def list_adapters(self) -> Set[int]: - return list_adapters_worker(self._adapter_manager.list_adapters) - - -class LRUCacheWorkerPromptAdapterManager(WorkerPromptAdapterManager): - """WorkerPromptAdapterManager that manages - prompt_adapter models on the worker side. - - Uses an LRU Cache. 
Every request, the requested - prompt_adapters will be loaded (unless they are already loaded) - and least recently used prompt_adapters will - be unloaded if the cache is above capacity.""" - - _prompt_adapter_manager_cls: Type[ - LRUCachePromptAdapterModelManager] = LRUCachePromptAdapterModelManager - - def create_prompt_adapter_manager( - self, - model: torch.nn.Module, - ) -> Any: - prompt_adapter_manager = create_prompt_adapter_manager( - model, - max_num_seqs=self.max_num_seqs, - max_num_batched_tokens=self.max_num_batched_tokens, - prompt_adapter_config=self.prompt_adapter_config, - prompt_adapter_manager_cls=self._prompt_adapter_manager_cls) - self._adapter_manager: LRUCachePromptAdapterModelManager = ( - prompt_adapter_manager) - return prompt_adapter_manager.model - - def _apply_adapters( - self, prompt_adapter_requests: Set[PromptAdapterRequest]) -> None: - prompt_adapters_map = { - prompt_adapter_request.prompt_adapter_id: prompt_adapter_request - for prompt_adapter_request in prompt_adapter_requests - if prompt_adapter_request - } - if len(prompt_adapters_map - ) > self._adapter_manager.prompt_adapter_slots: - raise RuntimeError( - f"Number of requested prompt_adapters " - f"({len(prompt_adapters_map)}) is greater " - "than the number of GPU prompt_adapter slots " - f"({self._adapter_manager.prompt_adapter_slots}).") - for prompt_adapter in prompt_adapters_map.values(): - self.add_adapter(prompt_adapter) - - def add_adapter(self, - prompt_adapter_request: PromptAdapterRequest) -> bool: - if prompt_adapter_request.prompt_adapter_id not in self.list_adapters( - ): - # Remove before we load the new prompt_adapter to save memory - if len(self._adapter_manager) + 1 > self._adapter_manager.capacity: - self._adapter_manager.remove_oldest_adapter() - prompt_adapter = self._load_adapter(prompt_adapter_request) - loaded = self._adapter_manager.add_adapter(prompt_adapter) - else: - # If the prompt_adapter is already loaded, just touch it to - # update its position in the caches - loaded = self._adapter_manager.get_adapter( - prompt_adapter_request.prompt_adapter_id) is not None - self._adapter_manager.activate_adapter( - prompt_adapter_request.prompt_adapter_id) - return loaded diff --git a/vllm/sequence.py b/vllm/sequence.py index ffe890eb2da..ddeb89c87ac 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -19,7 +19,6 @@ from vllm.lora.request import LoRARequest from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict from vllm.pooling_params import PoolingParams -from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import RequestOutputKind, SamplingParams VLLM_TOKEN_ID_ARRAY_TYPE = "l" @@ -466,7 +465,6 @@ class Sequence: block size used by the block manager and cache engine. eos_token_id: The end-of-sequence (EOS) token id recognized by this LLM. lora_request: LoRA request. - prompt_adapter_request: Prompt Adapter request. 
""" def __init__( @@ -476,14 +474,12 @@ def __init__( block_size: int, eos_token_id: Optional[int] = None, lora_request: Optional[LoRARequest] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> None: self.seq_id = seq_id self.inputs = inputs self.block_size = block_size self.eos_token_id = eos_token_id self.lora_request = lora_request - self.prompt_adapter_request = prompt_adapter_request self.data = SequenceData.from_seqs( self.prompt_token_ids, @@ -545,11 +541,6 @@ def multi_modal_placeholders(self) -> MultiModalPlaceholderDict: def lora_int_id(self) -> int: return self.lora_request.lora_int_id if self.lora_request else 0 - @property - def prompt_adapter_id(self) -> int: - return self.prompt_adapter_request.prompt_adapter_id \ - if self.prompt_adapter_request else 0 - def get_output_text_to_return(self, buffer_length: int, delta: bool) -> str: """If delta is True, only new text since the last call to @@ -609,12 +600,12 @@ def extra_hash(self) -> Optional[int]: designed for prefix caching mode. The final sequence hash is determined by applying token_ids from the sequence's blocks. """ - if self.prompt_adapter_id == 0 and self.lora_int_id == 0: + if self.lora_int_id == 0: return None # NOTE: If there are additional factors influencing the block aside from # token_ids, include them as input parameters to the hash. - return hash((self.prompt_adapter_id, self.lora_int_id)) + return hash(self.lora_int_id) def num_hashed_tokens_of_block(self, logical_idx: int): return logical_idx * self.block_size + self.block_size @@ -715,7 +706,6 @@ class SequenceGroup: encoder_seq: Optional, the single encoder sequence. Should be None unless you are working with an encoder/decoder model. trace_headers: OpenTelemetry trace headers. - prompt_adapter_request: Prompt Adapter request. priority: User-defined priority of the request. draft_size: The number of speculative tokens plus one from the target model; equal to max number of tokens a step can generate @@ -733,7 +723,6 @@ def __init__(self, pooled_data: Optional[torch.Tensor] = None, encoder_seq: Optional[Sequence] = None, trace_headers: Optional[Mapping[str, str]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, draft_size: int = 1) -> None: self.request_id = request_id @@ -757,7 +746,6 @@ def __init__(self, self.state = SequenceGroupState() self.pooling_params = pooling_params self.pooled_data = pooled_data - self.prompt_adapter_request = prompt_adapter_request self.encoder_seq = encoder_seq self.trace_headers = trace_headers self.priority = priority @@ -812,16 +800,6 @@ def multi_modal_placeholders(self) -> MultiModalPlaceholderDict: def lora_int_id(self) -> int: return self.lora_request.lora_int_id if self.lora_request else 0 - @property - def prompt_adapter_id(self) -> int: - return self.prompt_adapter_request.prompt_adapter_id \ - if self.prompt_adapter_request else 0 - - @property - def prompt_adapter_num_virtual_tokens(self) -> int: - return self.prompt_adapter_request.prompt_adapter_num_virtual_tokens\ - if self.prompt_adapter_request else 0 - def init_multi_step(self, num_steps: int) -> None: self.state.num_steps = num_steps self.state.current_step = 0 @@ -1021,7 +999,6 @@ class SequenceGroupMetadata( (SequenceGroup.encoder_seq). Should be None unless you are working with an encoder/decoder model. - prompt_adapter_request: Prompt Adapter request. 
""" request_id: str @@ -1040,7 +1017,6 @@ class SequenceGroupMetadata( multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None encoder_seq_data: Optional[SequenceData] = None cross_block_table: Optional[list[int]] = None - prompt_adapter_request: Optional[PromptAdapterRequest] = None token_chunk_size: Optional[int] = None ### Stateful fields that are lazily defined. ### @@ -1062,16 +1038,6 @@ def __post_init__(self): def lora_int_id(self) -> int: return self.lora_request.lora_int_id if self.lora_request else 0 - @property - def prompt_adapter_id(self) -> int: - return self.prompt_adapter_request.prompt_adapter_id \ - if self.prompt_adapter_request else 0 - - @property - def prompt_adapter_num_virtual_tokens(self) -> int: - return self.prompt_adapter_request.prompt_adapter_num_virtual_tokens \ - if self.prompt_adapter_request else 0 - # Multi-Step Chunked-Prefill property @property def is_single_step_prompt(self) -> bool: @@ -1524,7 +1490,6 @@ def add_request(request_id: str, engine, params, **kwargs): pooled_data=seq_group.pooled_data, encoder_seq=seq_group.encoder_seq, trace_headers=seq_group.trace_headers, - prompt_adapter_request=seq_group.prompt_adapter_request, priority=seq_group.priority, ) diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py index 96646ec9471..876ae3b762a 100644 --- a/vllm/spec_decode/draft_model_runner.py +++ b/vllm/spec_decode/draft_model_runner.py @@ -137,7 +137,6 @@ def supports_gpu_multi_step(self, execute_model_req: ExecuteModelRequest): 1. Only decodes 2. Only flash-attn 3. No LORA - 4. No prompt_adapter_config """ if not allow_gpu_advance_step: return False @@ -152,11 +151,7 @@ def supports_gpu_multi_step(self, execute_model_req: ExecuteModelRequest): return False # TODO: Add support for LORA - if self.lora_config: - return False - - # TODO: Add soft-tuning prompt adapter support - return not self.prompt_adapter_config + return not self.lora_config def set_indices_of_seq_with_bonus_tokens(self, indices_of_seq_with_bonus_tokens): @@ -199,9 +194,6 @@ def execute_model( # Sanity if self.lora_config is not None: raise ValueError("TP1DraftModelRunner has no support for LORA") - if self.prompt_adapter_config is not None: - raise ValueError("TP1DraftModelRunner has no support for " - "prompt_adapter_config") if model_input.inputs_embeds is not None: raise ValueError("TP1DraftModelRunner has no support for " "inputs_embeds") @@ -216,13 +208,6 @@ def execute_model( self.set_active_loras(model_input.lora_requests, model_input.lora_mapping) - if self.prompt_adapter_config: - assert model_input.prompt_adapter_requests is not None - assert model_input.prompt_adapter_mapping is not None - self.set_active_prompt_adapters( - model_input.prompt_adapter_requests, - model_input.prompt_adapter_mapping) - self.attn_state.begin_forward(model_input) # Detect exec mode diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index bfdbd682464..15753a93653 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -127,10 +127,6 @@ "backends currently supported with encoder/" "decoder models.") -STR_NOT_IMPL_ENC_DEC_PROMPT_ADAPTER = ("Prompt adapters are not " - "currently supported with encoder/" - "decoder models.") - # Efficiently import all enc/dec error strings # rather than having to import all of the above STR_NOT_IMPL_ENC_DEC_ERR_STRS = { @@ -144,7 +140,6 @@ "STR_NOT_IMPL_ENC_DEC_MM": STR_NOT_IMPL_ENC_DEC_MM, "STR_NOT_IMPL_ENC_DEC_SPEC_DEC": STR_NOT_IMPL_ENC_DEC_SPEC_DEC, "STR_NOT_IMPL_ENC_DEC_BACKEND": 
STR_NOT_IMPL_ENC_DEC_BACKEND, - "STR_NOT_IMPL_ENC_DEC_PROMPT_ADAPTER": STR_NOT_IMPL_ENC_DEC_PROMPT_ADAPTER, } # Constants related to forcing the attention backend selection diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 3754570dfaa..ef472b62b43 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -19,7 +19,6 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.outputs import PoolingRequestOutput, RequestOutput from vllm.pooling_params import PoolingParams -from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) @@ -219,7 +218,6 @@ async def add_request( lora_request: Optional[LoRARequest] = None, tokenization_kwargs: Optional[dict[str, Any]] = None, trace_headers: Optional[Mapping[str, str]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, data_parallel_rank: Optional[int] = None, ) -> RequestOutputCollector: @@ -236,8 +234,7 @@ async def add_request( # Convert Input --> Request. prompt_str, request = self.processor.process_inputs( request_id, prompt, params, arrival_time, lora_request, - tokenization_kwargs, trace_headers, prompt_adapter_request, - priority, data_parallel_rank) + tokenization_kwargs, trace_headers, priority, data_parallel_rank) if is_pooling or params.n == 1: await self._add_request(request, prompt_str, None, 0, queue) @@ -281,7 +278,6 @@ async def generate( request_id: str, lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, data_parallel_rank: Optional[int] = None, ) -> AsyncGenerator[RequestOutput, None]: @@ -312,7 +308,6 @@ async def generate( sampling_params, lora_request=lora_request, trace_headers=trace_headers, - prompt_adapter_request=prompt_adapter_request, priority=priority, data_parallel_rank=data_parallel_rank, ) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index a2328c37ba0..bf5167f870b 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -17,7 +17,6 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.outputs import PoolingRequestOutput, RequestOutput from vllm.pooling_params import PoolingParams -from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer_group import ( TokenizerGroup, init_tokenizer_from_configs) @@ -189,7 +188,6 @@ def add_request( lora_request: Optional[LoRARequest] = None, tokenization_kwargs: Optional[dict[str, Any]] = None, trace_headers: Optional[Mapping[str, str]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, ) -> None: # Validate the request_id type. @@ -200,8 +198,7 @@ def add_request( # Process raw inputs into the request. 
prompt_str, request = self.processor.process_inputs( request_id, prompt, params, arrival_time, lora_request, - tokenization_kwargs, trace_headers, prompt_adapter_request, - priority) + tokenization_kwargs, trace_headers, priority) n = params.n if isinstance(params, SamplingParams) else 1 diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 9fc52543efd..abac252a51f 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -16,7 +16,6 @@ from vllm.multimodal.processing import EncDecMultiModalProcessor from vllm.multimodal.utils import merge_and_sort_multimodal_metadata from vllm.pooling_params import PoolingParams -from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer_group import TokenizerGroup from vllm.v1.engine import EngineCoreRequest @@ -221,7 +220,6 @@ def process_inputs( lora_request: Optional[LoRARequest] = None, tokenization_kwargs: Optional[dict[str, Any]] = None, trace_headers: Optional[Mapping[str, str]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, data_parallel_rank: Optional[int] = None, ) -> tuple[Optional[str], EngineCoreRequest]: @@ -232,8 +230,6 @@ def process_inputs( self._validate_params(params, lora_request) if trace_headers is not None: raise ValueError("V1 does not support tracing yet.") - if prompt_adapter_request is not None: - raise ValueError("V1 does not support prompt_adapter_request.") data_parallel_size = self.vllm_config.parallel_config.data_parallel_size if data_parallel_rank is not None and not (0 <= data_parallel_rank < @@ -248,12 +244,10 @@ def process_inputs( # 1. Tokenize text prompt, with LoRA request if one exists. # 2. For multimodal models with a merged preprocessor, preprocess # multimodal data and expand prompt token ids accordingly. - # 3. Apply prompt adapter to prompt token ids if one exists. 
processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess( prompt, tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request, return_mm_hashes=self.use_hash, ) from vllm.platforms import current_platform diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index 6b40cf6fd36..364c5bb5c16 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -366,8 +366,6 @@ def report_usage_stats( # Feature flags "enable_lora": bool(vllm_config.lora_config), - "enable_prompt_adapter": - bool(vllm_config.prompt_adapter_config), "enable_prefix_caching": vllm_config.cache_config.enable_prefix_caching, "enforce_eager": diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 5a26e88db1f..a7b986fa2aa 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -103,7 +103,6 @@ def __init__( self.parallel_config = vllm_config.parallel_config self.scheduler_config = vllm_config.scheduler_config self.speculative_config = vllm_config.speculative_config - self.prompt_adapter_config = vllm_config.prompt_adapter_config self.observability_config = vllm_config.observability_config from vllm.model_executor.models.utils import set_cpu_offload_max_bytes diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index f5f26d8fff9..f4bf8a281ee 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -112,7 +112,6 @@ def __init__( self.original_parallel_config = original_parallel_config self.scheduler_config = vllm_config.scheduler_config self.speculative_config = vllm_config.speculative_config - self.prompt_adapter_config = vllm_config.prompt_adapter_config self.observability_config = vllm_config.observability_config self.device_config = vllm_config.device_config diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py index ade4d082116..913bc4f03d9 100644 --- a/vllm/v1/worker/tpu_worker.py +++ b/vllm/v1/worker/tpu_worker.py @@ -60,7 +60,6 @@ def __init__( self.scheduler_config = vllm_config.scheduler_config self.device_config = vllm_config.device_config self.speculative_config = vllm_config.speculative_config - self.prompt_adapter_config = vllm_config.prompt_adapter_config self.observability_config = vllm_config.observability_config self.parallel_config.rank = rank diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index 8d92edc5b38..cb5d5664ab5 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -91,10 +91,9 @@ def __init__( ''' EncoderDecoderModelRunner constructor. - `lora_config` and `prompt_adapter_config` are - unused (since these features are not yet supported for encoder/decoder - models) but these arguments are present here for compatibility with - the base-class constructor. + `lora_config` is unused (since these features are not yet supported + for encoder/decoder models) but these arguments are present here for + compatibility with the base-class constructor. 
''' self._maybe_force_supported_attention_backend() diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py index 560110df0a3..4916a2a201f 100644 --- a/vllm/worker/hpu_worker.py +++ b/vllm/worker/hpu_worker.py @@ -24,7 +24,6 @@ from vllm.model_executor import set_random_seed from vllm.model_executor.layers.sampler import SamplerOutput from vllm.platforms import current_platform -from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import ExecuteModelRequest from vllm.utils import bind_kv_cache from vllm.worker.cache_engine import CacheEngine @@ -367,23 +366,6 @@ def pin_lora(self, lora_id: int) -> bool: def list_loras(self) -> Set[int]: return self.model_runner.list_loras() - def add_prompt_adapter( - self, prompt_adapter_request: PromptAdapterRequest) -> bool: - raise NotImplementedError( - "Prompt Adapter is not implemented for HPU backend.") - - def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool: - raise NotImplementedError( - "Prompt Adapter is not implemented for HPU backend.") - - def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: - raise NotImplementedError( - "Prompt Adapter is not implemented for HPU backend.") - - def list_prompt_adapters(self) -> Set[int]: - raise NotImplementedError( - "Prompt Adapter is not implemented for HPU backend.") - def shutdown_inc(self): self.model_runner.shutdown_inc() diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 82db6617ba5..51efb563acf 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -45,10 +45,6 @@ from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, MultiModalKwargs, MultiModalPlaceholderMap, MultiModalRegistry) -from vllm.prompt_adapter.layers import PromptAdapterMapping -from vllm.prompt_adapter.request import PromptAdapterRequest -from vllm.prompt_adapter.worker_manager import ( - LRUCacheWorkerPromptAdapterManager) from vllm.sampling_params import SamplingParams from vllm.sequence import IntermediateTensors, SequenceGroupMetadata from vllm.utils import (DeviceMemoryProfiler, GiB_bytes, PyObjectCache, @@ -95,8 +91,6 @@ class ModelInputForGPU(ModelRunnerInputBase): lora_mapping: Optional["LoRAMapping"] = None lora_requests: Optional[Set[LoRARequest]] = None attn_metadata: Optional["AttentionMetadata"] = None - prompt_adapter_mapping: Optional[PromptAdapterMapping] = None - prompt_adapter_requests: Optional[Set[PromptAdapterRequest]] = None multi_modal_kwargs: Optional[BatchedTensorInputs] = None request_ids_to_seq_ids: Optional[Dict[str, List[int]]] = None finished_requests_ids: Optional[List[str]] = None @@ -113,8 +107,6 @@ def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: "lora_requests": self.lora_requests, "lora_mapping": self.lora_mapping, "multi_modal_kwargs": self.multi_modal_kwargs, - "prompt_adapter_mapping": self.prompt_adapter_mapping, - "prompt_adapter_requests": self.prompt_adapter_requests, "virtual_engine": self.virtual_engine, "request_ids_to_seq_ids": self.request_ids_to_seq_ids, "finished_requests_ids": self.finished_requests_ids, @@ -164,8 +156,6 @@ def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: "lora_requests": self.lora_requests, "lora_mapping": self.lora_mapping, "multi_modal_kwargs": self.multi_modal_kwargs, - "prompt_adapter_mapping": self.prompt_adapter_mapping, - "prompt_adapter_requests": self.prompt_adapter_requests, "virtual_engine": self.virtual_engine, "request_ids_to_seq_ids": self.request_ids_to_seq_ids, "finished_requests_ids": 
self.finished_requests_ids, @@ -212,8 +202,6 @@ def simple_reinit(self): self.lora_index_mapping.clear() # type: ignore self.lora_prompt_mapping.clear() # type: ignore self.lora_requests.clear() # type: ignore - self.prompt_adapter_index_mapping.clear() # type: ignore - self.prompt_adapter_prompt_mapping.clear() # type: ignore def __init__( self, @@ -252,11 +240,6 @@ def __init__( lora_prompt_mapping: Optional[List[List[int]]] = None, lora_requests: Optional[Set[LoRARequest]] = None, - # Prompt adapter inputs. - prompt_adapter_index_mapping: Optional[List[int]] = None, - prompt_adapter_prompt_mapping: Optional[List[int]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, - # Multi-modal inputs. multi_modal_kwargs: Optional[MultiModalKwargs] = None, multi_modal_placeholder_maps: Optional[Dict[ @@ -360,18 +343,6 @@ def __init__( else: self.lora_requests.clear() - if prompt_adapter_index_mapping: - self.prompt_adapter_index_mapping = \ - prompt_adapter_index_mapping - else: - self.prompt_adapter_index_mapping.clear() - - if prompt_adapter_prompt_mapping: - self.prompt_adapter_prompt_mapping = \ - prompt_adapter_prompt_mapping - else: - self.prompt_adapter_prompt_mapping.clear() - else: self.input_tokens = input_tokens or [] self.inputs_embeds = inputs_embeds @@ -390,12 +361,6 @@ def __init__( self.lora_prompt_mapping = lora_prompt_mapping or [] self.lora_requests = lora_requests or set() - self.prompt_adapter_index_mapping = ( - prompt_adapter_index_mapping or []) - self.prompt_adapter_prompt_mapping = ( - prompt_adapter_prompt_mapping or []) - - self.prompt_adapter_request = prompt_adapter_request self.multi_modal_kwargs = multi_modal_kwargs self.multi_modal_placeholder_maps = multi_modal_placeholder_maps self.prefix_cache_hit = prefix_cache_hit @@ -485,7 +450,6 @@ def __init__(self, # Compute functions for each sequence group. # WARNING: The order of the functions matters! self.per_seq_group_compute_fns = [ - self._compute_prompt_adapter_input, self._compute_multi_modal_input, ] @@ -496,8 +460,6 @@ def __init__(self, self.sliding_window = self.runner.sliding_window self.block_size = self.runner.block_size self.enable_lora = self.runner.lora_config is not None - self.enable_prompt_adapter = (self.runner.prompt_adapter_config - is not None) # Attention metadata inputs. if self.attn_backend is not None: @@ -693,34 +655,6 @@ def _compute_lora_input(self, inter_data: InterDataForSeqGroup, else: inter_data.lora_prompt_mapping.append([]) - def _compute_prompt_adapter_input( - self, inter_data: InterDataForSeqGroup, - seq_group_metadata: SequenceGroupMetadata): - """If prompt adapter is enabled, compute index and prompt mapping. - """ - # Note that when is_prompt=True, we expect only one sequence - # in the group. - if not self.enable_prompt_adapter: - return - - prompt_adapter_id = seq_group_metadata.prompt_adapter_id - if prompt_adapter_id <= 0 or not inter_data.is_prompt: - return - - # We expect only one sequence in the group when is_prompt=True. 
- assert inter_data.n_seqs == 1 - query_len = inter_data.query_lens[0] - inter_data.prompt_adapter_request = ( - seq_group_metadata.prompt_adapter_request) - - num_tokens = seq_group_metadata.prompt_adapter_num_virtual_tokens - inter_data.prompt_adapter_index_mapping = [ - prompt_adapter_id - ] * num_tokens + [0] * (query_len - num_tokens) - inter_data.prompt_adapter_prompt_mapping = [prompt_adapter_id] * ( - query_len if seq_group_metadata.sampling_params - and seq_group_metadata.sampling_params.prompt_logprobs else 1) - def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, seq_group_metadata: SequenceGroupMetadata): """If multi-modal data is given, add it to the input.""" @@ -1009,29 +943,6 @@ def build(self) -> ModelInputForGPU: prompt_mapping=lora_prompt_mapping, is_prefill=not self.decode_only)) - # Prompt adapter data. - prompt_adapter_requests: Set[PromptAdapterRequest] = set() - prompt_adapter_mapping = None - if self.enable_prompt_adapter: - prompt_adapter_requests = set( - data.prompt_adapter_request for data in self.inter_data_list - if data.prompt_adapter_request is not None) - prompt_adapter_index_mapping = flatten_2d_lists([ - inter_data.prompt_adapter_index_mapping - for inter_data in self.inter_data_list - ]) - if cuda_graph_pad_size: - prompt_adapter_index_mapping.extend( - itertools.repeat(0, cuda_graph_pad_size)) - prompt_adapter_prompt_mapping = flatten_2d_lists([ - inter_data.prompt_adapter_prompt_mapping - for inter_data in self.inter_data_list - ]) - prompt_adapter_mapping = PromptAdapterMapping( - prompt_adapter_index_mapping, - prompt_adapter_prompt_mapping, - ) - # Multi-modal data. multi_modal_kwargs_list = [ data.multi_modal_kwargs for data in self.inter_data_list @@ -1051,9 +962,7 @@ def build(self) -> ModelInputForGPU: lora_requests=lora_requests, multi_modal_kwargs=multi_modal_kwargs, request_ids_to_seq_ids=request_ids_to_seq_ids, - finished_requests_ids=self.finished_requests_ids, - prompt_adapter_mapping=prompt_adapter_mapping, - prompt_adapter_requests=prompt_adapter_requests) + finished_requests_ids=self.finished_requests_ids) class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): @@ -1144,7 +1053,6 @@ def __init__( self.model: nn.Module # Set after load_model # Set after load_model. 
self.lora_manager: Optional[LRUCacheWorkerLoRAManager] = None - self.prompt_adapter_manager: LRUCacheWorkerPromptAdapterManager = None self.sampler = get_sampler() set_cpu_offload_max_bytes( @@ -1203,14 +1111,7 @@ def load_model(self) -> None: logger.info("Model loading took %.4f GiB and %.6f seconds", self.model_memory_usage / GiB_bytes, time_after_load - time_before_load) - if self.prompt_adapter_config: - self.prompt_adapter_manager = LRUCacheWorkerPromptAdapterManager( - self.scheduler_config.max_num_seqs, - self.scheduler_config.max_num_batched_tokens, self.device, - self.prompt_adapter_config) - self.model = ( - self.prompt_adapter_manager.create_prompt_adapter_manager( - self.model)) + if self.vllm_config.compilation_config.level ==\ CompilationLevel.DYNAMO_AS_IS and supports_dynamo(): @@ -1461,40 +1362,6 @@ def list_loras(self) -> Set[int]: raise RuntimeError("LoRA is not enabled.") return self.lora_manager.list_adapters() - def remove_all_prompt_adapters(self): - if not self.prompt_adapter_manager: - raise RuntimeError("PromptAdapter is not enabled.") - self.prompt_adapter_manager.remove_all_adapters() - - def set_active_prompt_adapters( - self, prompt_adapter_requests: Set[PromptAdapterRequest], - prompt_adapter_mapping: PromptAdapterMapping) -> None: - if not self.prompt_adapter_manager: - raise RuntimeError("PromptAdapter is not enabled.") - self.prompt_adapter_manager.set_active_adapters( - prompt_adapter_requests, prompt_adapter_mapping) - - def add_prompt_adapter( - self, prompt_adapter_request: PromptAdapterRequest) -> bool: - if not self.prompt_adapter_manager: - raise RuntimeError("PromptAdapter is not enabled.") - return self.prompt_adapter_manager.add_adapter(prompt_adapter_request) - - def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool: - if not self.prompt_adapter_manager: - raise RuntimeError("PromptAdapter is not enabled.") - return self.prompt_adapter_manager.remove_adapter(prompt_adapter_id) - - def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: - if not self.prompt_adapter_manager: - raise RuntimeError("PromptAdapter is not enabled.") - return self.prompt_adapter_manager.pin_adapter(prompt_adapter_id) - - def list_prompt_adapters(self) -> Set[int]: - if not self.prompt_adapter_manager: - raise RuntimeError("PromptAdapter is not enabled.") - return self.prompt_adapter_manager.list_adapters() - @torch.inference_mode() def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: """Cuda graph capture a model. @@ -1603,13 +1470,6 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: self.set_active_loras(set([dummy_lora_request]), lora_mapping) - if self.prompt_adapter_config: - prompt_adapter_mapping = PromptAdapterMapping( - [-1] * batch_size, - [-1] * batch_size, - ) - self.set_active_prompt_adapters( - set(), prompt_adapter_mapping) graph_runner = CUDAGraphRunner( self.model, self.attn_backend.get_name(), self.attn_state.graph_clone(batch_size), @@ -1770,13 +1630,6 @@ def execute_model( self.set_active_loras(model_input.lora_requests, model_input.lora_mapping) - if self.prompt_adapter_config: - assert model_input.prompt_adapter_requests is not None - assert model_input.prompt_adapter_mapping is not None - self.set_active_prompt_adapters( - model_input.prompt_adapter_requests, - model_input.prompt_adapter_mapping) - self.attn_state.begin_forward(model_input) # Currently cuda graph is only supported by the decode phase. 
diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py index d567ce4a6e7..2b31fe52483 100644 --- a/vllm/worker/model_runner_base.py +++ b/vllm/worker/model_runner_base.py @@ -188,7 +188,6 @@ def __init__( self.scheduler_config = vllm_config.scheduler_config self.device_config = vllm_config.device_config self.speculative_config = vllm_config.speculative_config - self.prompt_adapter_config = vllm_config.prompt_adapter_config self.observability_config = vllm_config.observability_config # Map of request_id -> generator used for seeded random sampling diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index 0680e60b52a..2aa910bdff6 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -288,9 +288,6 @@ def maybe_advance_frozen_model_input(self, device: str, pin_memory: bool): assert fmi.lora_requests is not None assert len(fmi.lora_requests) == 0 assert fmi.attn_metadata is not None - assert fmi.prompt_adapter_mapping is None - assert fmi.prompt_adapter_requests is not None - assert len(fmi.prompt_adapter_requests) == 0 assert fmi.multi_modal_kwargs is not None assert len(fmi.multi_modal_kwargs) == 0 diff --git a/vllm/worker/pooling_model_runner.py b/vllm/worker/pooling_model_runner.py index f80955f71a5..01bbdf645ae 100644 --- a/vllm/worker/pooling_model_runner.py +++ b/vllm/worker/pooling_model_runner.py @@ -63,13 +63,6 @@ def execute_model( self.set_active_loras(model_input.lora_requests, model_input.lora_mapping) - if self.prompt_adapter_config: - assert model_input.prompt_adapter_requests is not None - assert model_input.prompt_adapter_mapping is not None - self.set_active_prompt_adapters( - model_input.prompt_adapter_requests, - model_input.prompt_adapter_mapping) - # Currently cuda graph is only supported by the decode phase. 
assert model_input.attn_metadata is not None prefill_meta = model_input.attn_metadata.prefill_metadata diff --git a/vllm/worker/utils.py b/vllm/worker/utils.py index 1a5f62cb3c4..512a1dca737 100644 --- a/vllm/worker/utils.py +++ b/vllm/worker/utils.py @@ -47,7 +47,3 @@ def assert_enc_dec_mr_supported_scenario( if enc_dec_mr.scheduler_config.num_lookahead_slots > 0: raise NotImplementedError( STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_SPEC_DEC']) - - if enc_dec_mr.prompt_adapter_config is not None: - raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_ERR_STRS[ - 'STR_NOT_IMPL_ENC_DEC_PROMPT_ADAPTER']) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 21e684a3fb5..b7933a848bb 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -21,7 +21,6 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader.tensorizer import TensorizerConfig from vllm.platforms import current_platform -from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import (ExecuteModelRequest, IntermediateTensors, SequenceGroupMetadata, SequenceGroupMetadataDelta) from vllm.utils import (GiB_bytes, MemorySnapshot, bind_kv_cache, @@ -490,19 +489,6 @@ def pin_lora(self, lora_id: int) -> bool: def list_loras(self) -> Set[int]: return self.model_runner.list_loras() - def add_prompt_adapter( - self, prompt_adapter_request: PromptAdapterRequest) -> bool: - return self.model_runner.add_prompt_adapter(prompt_adapter_request) - - def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool: - return self.model_runner.remove_lora(prompt_adapter_id) - - def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: - return self.model_runner.pin_prompt_adapter(prompt_adapter_id) - - def list_prompt_adapters(self) -> Set[int]: - return self.model_runner.list_prompt_adapters() - @property def max_model_len(self) -> int: return self.model_config.max_model_len diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index c382b29ad19..515ca91a4de 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -49,7 +49,6 @@ def __init__( self.scheduler_config = vllm_config.scheduler_config self.device_config = vllm_config.device_config self.speculative_config = vllm_config.speculative_config - self.prompt_adapter_config = vllm_config.prompt_adapter_config self.observability_config = vllm_config.observability_config self.kv_transfer_config = vllm_config.kv_transfer_config self.compilation_config = vllm_config.compilation_config From a1a1ac806b9637836a5e02747a6f4e31e8d21925 Mon Sep 17 00:00:00 2001 From: mgoin Date: Mon, 7 Jul 2025 18:45:54 +0000 Subject: [PATCH 2/4] Remove from feature matrix Signed-off-by: mgoin --- docs/features/compatibility_matrix.md | 34 +++++++++++++-------------- 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/docs/features/compatibility_matrix.md b/docs/features/compatibility_matrix.md index 4f475ee4db8..44822bc232e 100644 --- a/docs/features/compatibility_matrix.md +++ b/docs/features/compatibility_matrix.md @@ -37,23 +37,22 @@ th:not(:first-child) { } -| Feature | [CP][chunked-prefill] | [APC][automatic-prefix-caching] | [LoRA][lora-adapter] | prmpt adptr | [SD][spec-decode] | CUDA graph | pooling | enc-dec | logP | prmpt logP | async output | multi-step | mm | best-of | beam-search | -|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---| -| [CP][chunked-prefill] | ✅ | | | | | | | | | | | | | | | -| [APC][automatic-prefix-caching] | ✅ | ✅ | | | | | | | | | | | | | 
| -| [LoRA][lora-adapter] | ✅ | ✅ | ✅ | | | | | | | | | | | | | -| prmpt adptr | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | | | -| [SD][spec-decode] | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | | | | | -| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | -| pooling | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | | | | | | | | | -| enc-dec | ❌ | [❌](gh-issue:7366) | ❌ | ❌ | [❌](gh-issue:7366) | ✅ | ✅ | ✅ | | | | | | | | -| logP | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | -| prmpt logP | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | | -| async output | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | | | | -| multi-step | ❌ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | | | -| mm | ✅ | [🟠](gh-pr:8348) | [🟠](gh-pr:4194) | ❔ | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | | -| best-of | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ✅ | ✅ | | -| beam-search | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ❔ | ✅ | ✅ | +| Feature | [CP][chunked-prefill] | [APC][automatic-prefix-caching] | [LoRA][lora-adapter] | [SD][spec-decode] | CUDA graph | pooling | enc-dec | logP | prmpt logP | async output | multi-step | mm | best-of | beam-search | +|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---| +| [CP][chunked-prefill] | ✅ | | | | | | | | | | | | | | +| [APC][automatic-prefix-caching] | ✅ | ✅ | | | | | | | | | | | | | +| [LoRA][lora-adapter] | ✅ | ✅ | ✅ | | | | | | | | | | | | +| [SD][spec-decode] | ✅ | ✅ | ❌ | ✅ | | | | | | | | | | | +| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | +| pooling | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | | | | | | | | | +| enc-dec | ❌ | [❌](gh-issue:7366) | ❌ | [❌](gh-issue:7366) | ✅ | ✅ | ✅ | | | | | | | | +| logP | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | +| prmpt logP | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | | +| async output | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | | | | +| multi-step | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | | | +| mm | ✅ | [🟠](gh-pr:8348) | [🟠](gh-pr:4194) | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | | +| best-of | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ✅ | ✅ | | +| beam-search | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ❔ | ✅ | ✅ | [](){ #feature-x-hardware } @@ -64,7 +63,6 @@ th:not(:first-child) { | [CP][chunked-prefill] | [❌](gh-issue:2729) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | [APC][automatic-prefix-caching] | [❌](gh-issue:3687) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | [LoRA][lora-adapter] | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| prmpt adptr | ✅ | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:8475) | ✅ | ❌ | | [SD][spec-decode] | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | | CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | | pooling | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ❌ | From 77b65be54be94242c990055659120401b4d0ec0d Mon Sep 17 00:00:00 2001 From: mgoin Date: Mon, 7 Jul 2025 18:54:37 +0000 Subject: [PATCH 3/4] Fix test_return_tokens_as_ids Signed-off-by: mgoin --- tests/entrypoints/openai/test_return_tokens_as_ids.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/entrypoints/openai/test_return_tokens_as_ids.py b/tests/entrypoints/openai/test_return_tokens_as_ids.py index 099062e55c7..af58fbd4b36 100644 --- a/tests/entrypoints/openai/test_return_tokens_as_ids.py +++ b/tests/entrypoints/openai/test_return_tokens_as_ids.py @@ -13,7 +13,6 @@ from .test_completion import default_server_args # noqa: F401 from .test_completion import zephyr_lora_added_tokens_files # noqa: F401 from .test_completion import zephyr_lora_files # noqa: F401 -from .test_completion import 
zephyr_pa_files # noqa: F401 from .test_completion import MODEL_NAME From 6e247a459588a58889ab684f17af29e17f33f00b Mon Sep 17 00:00:00 2001 From: mgoin Date: Tue, 15 Jul 2025 23:17:41 -0400 Subject: [PATCH 4/4] Add deprecation warning Signed-off-by: mgoin --- vllm/engine/arg_utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 365f1e55303..c522fb7b6ba 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -845,6 +845,12 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: parser.add_argument('--disable-log-stats', action='store_true', help='Disable logging statistics.') + parser.add_argument('--enable-prompt-adapter', + action='store_true', + deprecated=True, + help='[DEPRECATED] Prompt adapter has been ' + 'removed. Setting this flag to True or False' + ' has no effect on vLLM behavior.') return parser
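
A quick way to sanity-check the user-facing behavior of the new no-op flag in PATCH 4/4 is the plain-argparse sketch below. It assumes Python 3.13+, where `add_argument` accepts the `deprecated` keyword this patch relies on (vLLM's FlexibleArgumentParser provides the equivalent handling); the program name and the exact warning wording are illustrative only, not copied from vLLM's real CLI output.

# Minimal sketch of the deprecated, no-op --enable-prompt-adapter flag.
# Assumes stdlib argparse on Python 3.13+; the prog name and the quoted
# warning text below are illustrative, not vLLM's actual output.
import argparse

parser = argparse.ArgumentParser(prog="vllm-serve-sketch")
parser.add_argument(
    "--enable-prompt-adapter",
    action="store_true",
    deprecated=True,  # argparse emits a warning whenever the option is used
    help="[DEPRECATED] Prompt adapter has been removed; "
    "this flag has no effect.",
)

# The flag still parses cleanly; argparse only prints a stderr warning along
# the lines of "vllm-serve-sketch: warning: option '--enable-prompt-adapter'
# is deprecated", and nothing downstream reads the value anymore.
args = parser.parse_args(["--enable-prompt-adapter"])
print(args.enable_prompt_adapter)  # True, but ignored by the engine

With this in place, existing launch scripts that still pass --enable-prompt-adapter keep working: they see a deprecation warning instead of an "unrecognized arguments" error, which matches the intent of keeping the flag as a no-op for one release.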