diff --git a/docs/api/README.md b/docs/api/README.md
index 245c925f7f5..db4dab0ae53 100644
--- a/docs/api/README.md
+++ b/docs/api/README.md
@@ -14,7 +14,6 @@ API documentation for vLLM's configuration classes.
- [vllm.config.DeviceConfig][]
- [vllm.config.SpeculativeConfig][]
- [vllm.config.LoRAConfig][]
-- [vllm.config.PromptAdapterConfig][]
- [vllm.config.MultiModalConfig][]
- [vllm.config.PoolerConfig][]
- [vllm.config.DecodingConfig][]
diff --git a/docs/features/compatibility_matrix.md b/docs/features/compatibility_matrix.md
index fdd75bfe33d..8be1585f8e7 100644
--- a/docs/features/compatibility_matrix.md
+++ b/docs/features/compatibility_matrix.md
@@ -34,23 +34,22 @@ th:not(:first-child) {
}
-| Feature | [CP][chunked-prefill] | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | prmpt adptr | [SD](spec_decode.md) | CUDA graph | pooling | enc-dec | logP | prmpt logP | async output | multi-step | mm | best-of | beam-search |
-|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
+| Feature | [CP][chunked-prefill] | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](spec_decode.md) | CUDA graph | pooling | enc-dec | logP | prmpt logP | async output | multi-step | mm | best-of | beam-search |
+|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| [CP][chunked-prefill] | ✅ | | | | | | | | | | | | | | |
| [APC](automatic_prefix_caching.md) | ✅ | ✅ | | | | | | | | | | | | | |
| [LoRA](lora.md) | ✅ | ✅ | ✅ | | | | | | | | | | | | |
-| prmpt adptr | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | | |
-| [SD](spec_decode.md) | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | | | | |
-| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | |
-| pooling | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | | | | | | | | |
-| enc-dec | ❌ | [❌](gh-issue:7366) | ❌ | ❌ | [❌](gh-issue:7366) | ✅ | ✅ | ✅ | | | | | | | |
-| logP | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | |
-| prmpt logP | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | |
-| async output | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | | | |
-| multi-step | ❌ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | | |
-| mm | ✅ | [🟠](gh-pr:8348) | [🟠](gh-pr:4194) | ❔ | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | |
-| best-of | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ✅ | ✅ | |
-| beam-search | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ❔ | ✅ | ✅ |
+| [SD](spec_decode.md) | ✅ | ✅ | ❌ | ✅ | | | | | | | | | | |
+| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | |
+| pooling | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | | | | | | | | |
+| enc-dec | ❌ | [❌](gh-issue:7366) | ❌ | [❌](gh-issue:7366) | ✅ | ✅ | ✅ | | | | | | | |
+| logP | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | |
+| prmpt logP | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | |
+| async output | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | | | |
+| multi-step | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | | |
+| mm | ✅ | [🟠](gh-pr:8348) | [🟠](gh-pr:4194) | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | |
+| best-of | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ✅ | ✅ | |
+| beam-search | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ❔ | ✅ | ✅ |
[](){ #feature-x-hardware }
@@ -59,10 +58,9 @@ th:not(:first-child) {
| Feature | Volta | Turing | Ampere | Ada | Hopper | CPU | AMD | TPU |
|-----------------------------------------------------------|---------------------|-----------|-----------|--------|------------|--------------------|--------|-----|
| [CP][chunked-prefill] | [❌](gh-issue:2729) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
-| [APC](automatic_prefix_caching.md) | [❌](gh-issue:3687) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
-| [LoRA](lora.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
-| prmpt adptr | ✅ | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:8475) | ✅ | ❌ |
-| [SD](spec_decode.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
+| [APC](automatic_prefix_caching.md) | [❌](gh-issue:3687) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
+| [LoRA](lora.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
+| [SD](spec_decode.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ |
| pooling | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ❌ |
| enc-dec | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
diff --git a/pyproject.toml b/pyproject.toml
index 340abb38565..a85cf243386 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -72,7 +72,6 @@ line-length = 80
"vllm/core/**/*.py" = ["UP006", "UP035"]
"vllm/engine/**/*.py" = ["UP006", "UP035"]
"vllm/executor/**/*.py" = ["UP006", "UP035"]
-"vllm/prompt_adapter/**/*.py" = ["UP006", "UP035"]
"vllm/spec_decode/**/*.py" = ["UP006", "UP035"]
"vllm/worker/**/*.py" = ["UP006", "UP035"]
# Python 3.8 typing - skip utils for ROCm
diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py
index df9586ee84d..eaa7f877d5b 100644
--- a/tests/entrypoints/openai/test_completion.py
+++ b/tests/entrypoints/openai/test_completion.py
@@ -26,10 +26,6 @@
# technically these adapters use a different base model,
# but we're not testing generation quality here
LORA_NAME = "typeof/zephyr-7b-beta-lora"
-PA_NAME = "swapnilbp/llama_tweet_ptune"
-# if PA_NAME changes, PA_NUM_VIRTUAL_TOKENS might also
-# need to change to match the prompt adapter
-PA_NUM_VIRTUAL_TOKENS = 8
GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
@@ -56,13 +52,7 @@ def zephyr_lora_added_tokens_files(zephyr_lora_files):
@pytest.fixture(scope="module")
-def zephyr_pa_files():
- return snapshot_download(repo_id=PA_NAME)
-
-
-@pytest.fixture(scope="module")
-def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files,
- zephyr_pa_files):
+def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files):
return [
# use half precision for speed and memory savings in CI environment
"--dtype",
@@ -81,15 +71,6 @@ def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files,
"64",
"--max-cpu-loras",
"2",
- # pa config
- "--enable-prompt-adapter",
- "--prompt-adapters",
- f"zephyr-pa={zephyr_pa_files}",
- f"zephyr-pa2={zephyr_pa_files}",
- "--max-prompt-adapters",
- "2",
- "--max-prompt-adapter-token",
- "128",
]
@@ -110,14 +91,11 @@ async def client(server):
@pytest.mark.asyncio
@pytest.mark.parametrize(
- # first test base model, then test loras, then test prompt adapters
- "model_name,num_virtual_tokens",
- [(MODEL_NAME, 0), ("zephyr-lora", 0), ("zephyr-lora2", 0),
- ("zephyr-pa", PA_NUM_VIRTUAL_TOKENS),
- ("zephyr-pa2", PA_NUM_VIRTUAL_TOKENS)],
+ # first test base model, then test loras
+ "model_name",
+ [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
)
-async def test_single_completion(client: openai.AsyncOpenAI, model_name: str,
- num_virtual_tokens: int):
+async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
completion = await client.completions.create(model=model_name,
prompt="Hello, my name is",
max_tokens=5,
@@ -130,9 +108,7 @@ async def test_single_completion(client: openai.AsyncOpenAI, model_name: str,
assert len(choice.text) >= 5
assert choice.finish_reason == "length"
assert completion.usage == openai.types.CompletionUsage(
- completion_tokens=5,
- prompt_tokens=6 + num_virtual_tokens,
- total_tokens=11 + num_virtual_tokens)
+ completion_tokens=5, prompt_tokens=6, total_tokens=11)
# test using token IDs
completion = await client.completions.create(
@@ -175,9 +151,9 @@ async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI):
@pytest.mark.asyncio
@pytest.mark.parametrize(
- # first test base model, then test loras, then test prompt adapters
+ # first test base model, then test loras
"model_name",
- [MODEL_NAME, "zephyr-lora", "zephyr-lora2", "zephyr-pa", "zephyr-pa2"],
+ [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
)
async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str):
# test using token IDs
@@ -194,9 +170,9 @@ async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str):
@pytest.mark.asyncio
@pytest.mark.parametrize(
- # just test 1 lora and 1 pa hereafter
+ # just test 1 lora
"model_name",
- [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
+ [MODEL_NAME, "zephyr-lora"],
)
async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str):
# test using token IDs
@@ -217,7 +193,7 @@ async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str):
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
- [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
+ [MODEL_NAME, "zephyr-lora"],
)
async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str):
# test using token IDs
@@ -238,7 +214,7 @@ async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str):
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
- [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
+ [MODEL_NAME, "zephyr-lora"],
)
async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
model_name: str):
@@ -314,7 +290,7 @@ async def test_prompt_logprobs_completion(client: openai.AsyncOpenAI,
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
- [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
+ [MODEL_NAME, "zephyr-lora"],
)
async def test_completion_streaming(client: openai.AsyncOpenAI,
model_name: str):
@@ -348,7 +324,7 @@ async def test_completion_streaming(client: openai.AsyncOpenAI,
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
- [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
+ [MODEL_NAME, "zephyr-lora"],
)
async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str):
"""Streaming for parallel sampling.
@@ -382,7 +358,7 @@ async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str):
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
- [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
+ [MODEL_NAME, "zephyr-lora"],
)
async def test_completion_stream_options(client: openai.AsyncOpenAI,
model_name: str):
@@ -519,7 +495,7 @@ async def test_completion_stream_options(client: openai.AsyncOpenAI,
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
- [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
+ [MODEL_NAME, "zephyr-lora"],
)
async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
# test both text and token IDs
diff --git a/tests/entrypoints/openai/test_return_tokens_as_ids.py b/tests/entrypoints/openai/test_return_tokens_as_ids.py
index 099062e55c7..af58fbd4b36 100644
--- a/tests/entrypoints/openai/test_return_tokens_as_ids.py
+++ b/tests/entrypoints/openai/test_return_tokens_as_ids.py
@@ -13,7 +13,6 @@
from .test_completion import default_server_args # noqa: F401
from .test_completion import zephyr_lora_added_tokens_files # noqa: F401
from .test_completion import zephyr_lora_files # noqa: F401
-from .test_completion import zephyr_pa_files # noqa: F401
from .test_completion import MODEL_NAME
diff --git a/tests/entrypoints/openai/test_serving_models.py b/tests/entrypoints/openai/test_serving_models.py
index 5f334c754a3..c3b458d717f 100644
--- a/tests/entrypoints/openai/test_serving_models.py
+++ b/tests/entrypoints/openai/test_serving_models.py
@@ -32,8 +32,7 @@ async def _async_serving_models_init() -> OpenAIServingModels:
serving_models = OpenAIServingModels(engine_client=mock_engine_client,
base_model_paths=BASE_MODEL_PATHS,
model_config=mock_model_config,
- lora_modules=None,
- prompt_adapters=None)
+ lora_modules=None)
await serving_models.init_static_loras()
return serving_models
diff --git a/tests/prompt_adapter/test_bloom.py b/tests/prompt_adapter/test_bloom.py
deleted file mode 100644
index 2b603fe8f02..00000000000
--- a/tests/prompt_adapter/test_bloom.py
+++ /dev/null
@@ -1,48 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import pytest
-
-import vllm
-from vllm.prompt_adapter.request import PromptAdapterRequest
-
-MODEL_PATH = "bigscience/bloomz-560m"
-PA_PATH = 'stevhliu/bloomz-560m_PROMPT_TUNING_CAUSAL_LM'
-
-
-def do_sample(llm, pa_name: str, pa_id: int):
-
- prompts = [
- "Tweet text : @nationalgridus I have no water and the bill is \
- current and paid. Can you do something about this? Label : ",
- "Tweet text : @nationalgridus Looks good thanks! Label : "
- ]
- sampling_params = vllm.SamplingParams(temperature=0.0,
- max_tokens=3,
- stop_token_ids=[3])
-
- outputs = llm.generate(prompts,
- sampling_params,
- prompt_adapter_request=PromptAdapterRequest(
- pa_name, pa_id, PA_PATH, 8) if pa_id else None)
-
- # Print the outputs.
- generated_texts = []
- for output in outputs:
- prompt = output.prompt
- generated_text = output.outputs[0].text.strip()
- generated_texts.append(generated_text)
- print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
- return generated_texts
-
-
-@pytest.mark.parametrize("enforce_eager", [True, False])
-def test_twitter_prompt_adapter(enforce_eager: bool):
- llm = vllm.LLM(MODEL_PATH,
- enforce_eager=enforce_eager,
- enable_prompt_adapter=True,
- max_prompt_adapter_token=8)
-
- expected_output = ['complaint', 'no complaint']
-
- assert do_sample(llm, "twitter_pa", pa_id=1) == expected_output
diff --git a/tests/prompt_adapter/test_multi_adapter_inference.py b/tests/prompt_adapter/test_multi_adapter_inference.py
deleted file mode 100644
index 4f273afb4e3..00000000000
--- a/tests/prompt_adapter/test_multi_adapter_inference.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from vllm import EngineArgs, LLMEngine, SamplingParams
-from vllm.prompt_adapter.request import PromptAdapterRequest
-
-MODEL_PATH = "bigscience/bloomz-560m"
-pa_path = 'stevhliu/bloomz-560m_PROMPT_TUNING_CAUSAL_LM'
-pa_path2 = 'swapnilbp/angry_tweet_ptune'
-
-
-def do_sample(engine):
-
- prompts = [
- ("Tweet text: I have complaints! Label: ",
- SamplingParams(temperature=0.0, max_tokens=3, stop_token_ids=[3]),
- PromptAdapterRequest("hate_speech", 1, pa_path2, 8)),
- ("Tweet text: I have no problems Label: ",
- SamplingParams(temperature=0.0, max_tokens=3, stop_token_ids=[3]),
- PromptAdapterRequest("hate_speech2", 2, pa_path2, 8)),
- ("Tweet text: I have complaints! Label: ",
- SamplingParams(temperature=0.0, max_tokens=3), None),
- ("Tweet text: I have no problems Label: ",
- SamplingParams(temperature=0.0, max_tokens=3, stop_token_ids=[3]),
- PromptAdapterRequest("complain", 3, pa_path, 8)),
- ]
-
- request_id = 0
- results = set()
- while prompts or engine.has_unfinished_requests():
- if prompts:
- prompt, sampling_params, pa_request = prompts.pop(0)
- engine.add_request(str(request_id),
- prompt,
- sampling_params,
- prompt_adapter_request=pa_request)
- request_id += 1
-
- request_outputs = engine.step()
-
- for request_output in request_outputs:
- if request_output.finished:
- results.add(request_output.outputs[0].text)
- return results
-
-
-def test_multi_prompt_adapters():
- engine_args = EngineArgs(model=MODEL_PATH,
- max_prompt_adapters=3,
- enable_prompt_adapter=True,
- max_prompt_adapter_token=8)
- engine = LLMEngine.from_engine_args(engine_args)
- expected_output = {
- ' quot;I', 'hate speech', 'no complaint', 'not hate speech'
- }
- assert do_sample(engine) == expected_output
diff --git a/tests/prompt_adapter/test_pa_lora.py b/tests/prompt_adapter/test_pa_lora.py
deleted file mode 100644
index ba2e15b81bc..00000000000
--- a/tests/prompt_adapter/test_pa_lora.py
+++ /dev/null
@@ -1,64 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from huggingface_hub import snapshot_download
-
-from vllm import EngineArgs, LLMEngine, SamplingParams
-from vllm.lora.request import LoRARequest
-from vllm.prompt_adapter.request import PromptAdapterRequest
-
-MODEL_PATH = "meta-llama/Llama-2-7b-hf"
-pa_path = snapshot_download(repo_id="swapnilbp/llama_tweet_ptune")
-lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
-
-
-def do_sample(engine):
-
- prompt_text = "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]" # noqa: E501
-
- # first prompt with a prompt adapter and second without adapter
- prompts = [
- (prompt_text,
- SamplingParams(temperature=0.0, max_tokens=100,
- stop=["[/assistant]"]),
- PromptAdapterRequest("hate_speech", 1, pa_path,
- 8), LoRARequest("sql_test", 1, lora_path)),
- (prompt_text,
- SamplingParams(temperature=0.0, max_tokens=100,
- stop=["[/assistant]"]), None,
- LoRARequest("sql_test", 1, lora_path)),
- ]
-
- request_id = 0
- results = set()
- while prompts or engine.has_unfinished_requests():
- if prompts:
- prompt, sampling_params, pa_request, lora_request = prompts.pop(0)
- engine.add_request(str(request_id),
- prompt,
- sampling_params,
- prompt_adapter_request=pa_request,
- lora_request=lora_request)
- request_id += 1
-
- request_outputs = engine.step()
-
- for request_output in request_outputs:
- if request_output.finished:
- results.add(request_output.outputs[0].text)
- return results
-
-
-def test_lora_prompt_adapter():
- engine_args = EngineArgs(model=MODEL_PATH,
- enable_prompt_adapter=True,
- enable_lora=True,
- max_num_seqs=60,
- max_prompt_adapter_token=8)
- engine = LLMEngine.from_engine_args(engine_args)
- result = do_sample(engine)
-
- expected_output = {
- " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' " # noqa: E501
- }
- assert result == expected_output
diff --git a/tools/mypy.sh b/tools/mypy.sh
index 77d342da1ec..595cfe05662 100755
--- a/tools/mypy.sh
+++ b/tools/mypy.sh
@@ -31,7 +31,6 @@ run_mypy vllm/inputs
run_mypy vllm/lora
run_mypy vllm/model_executor
run_mypy vllm/plugins
-run_mypy vllm/prompt_adapter
run_mypy vllm/spec_decode
run_mypy vllm/worker
run_mypy vllm/v1
diff --git a/vllm/config.py b/vllm/config.py
index 6c56ac1eec8..fe9ef7896a4 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -3124,59 +3124,6 @@ def verify_lora_support(self):
"V1 LoRA does not support long LoRA, please use V0.")
-@config
-@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
-class PromptAdapterConfig:
- """Configuration for PromptAdapters."""
-
- max_prompt_adapters: int = 1
- """Max number of PromptAdapters in a batch."""
- max_prompt_adapter_token: int = 0
- """Max number of PromptAdapters tokens."""
- max_cpu_prompt_adapters: Optional[int] = None
- """Maximum number of PromptAdapters to store in CPU memory. Must be >= than
- `max_prompt_adapters`."""
- prompt_adapter_dtype: Union[torch.dtype, str] = "auto"
- """Data type for PromptAdapter. If auto, will default to base model dtype.
- """
-
- def compute_hash(self) -> str:
- """
- WARNING: Whenever a new field is added to this config,
- ensure that it is included in the factors list if
- it affects the computation graph.
-
- Provide a hash that uniquely identifies all the configs
- that affect the structure of the computation
- graph from input ids/embeddings to the final hidden states,
- excluding anything before input ids/embeddings and after
- the final hidden states.
- """
- # no factors to consider.
- # this config will not affect the computation graph.
- factors: list[Any] = []
- hash_str = hashlib.md5(str(factors).encode(),
- usedforsecurity=False).hexdigest()
- return hash_str
-
- def __post_init__(self):
-
- if self.max_prompt_adapters < 1:
- raise ValueError(f"max_prompt_adapters "
- f"({self.max_prompt_adapters}) must be >= 1.")
- if self.max_prompt_adapter_token == 0:
- raise ValueError("max_prompt_adapter_token must be set.")
- if self.max_cpu_prompt_adapters is None:
- self.max_cpu_prompt_adapters = self.max_prompt_adapters
-
- def verify_with_model_config(self, model_config: ModelConfig):
- if self.prompt_adapter_dtype == "auto":
- self.prompt_adapter_dtype = model_config.dtype
- elif isinstance(self.prompt_adapter_dtype, str):
- self.prompt_adapter_dtype = getattr(torch,
- self.prompt_adapter_dtype)
-
-
@config
@dataclass
class MultiModalConfig:
@@ -4382,8 +4329,6 @@ class VllmConfig:
"""Decoding configuration."""
observability_config: Optional[ObservabilityConfig] = None
"""Observability configuration."""
- prompt_adapter_config: Optional[PromptAdapterConfig] = None
- """Prompt adapter configuration."""
quant_config: Optional[QuantizationConfig] = None
"""Quantization configuration."""
compilation_config: CompilationConfig = field(
@@ -4480,10 +4425,6 @@ def compute_hash(self) -> str:
vllm_factors.append(self.observability_config.compute_hash())
else:
vllm_factors.append("None")
- if self.prompt_adapter_config:
- vllm_factors.append(self.prompt_adapter_config.compute_hash())
- else:
- vllm_factors.append("None")
if self.quant_config:
pass # should be captured by model_config.quantization
if self.compilation_config:
@@ -4592,9 +4533,6 @@ def __post_init__(self):
self.lora_config.verify_with_cache_config(self.cache_config)
self.lora_config.verify_with_model_config(self.model_config)
self.lora_config.verify_lora_support()
- if self.prompt_adapter_config is not None:
- self.prompt_adapter_config.verify_with_model_config(
- self.model_config)
if self.quant_config is None and self.model_config is not None:
self.quant_config = VllmConfig._get_quantization_config(
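
Note on the hashing convention the deleted class participated in: each sub-config hashes only the factors that affect the compiled computation graph, and `VllmConfig.compute_hash` appends either that hash or `"None"` for unset optional configs (as the removed `prompt_adapter_config` branch did above). A minimal sketch of that pattern under illustrative names, not vLLM's actual classes:

```python
import hashlib
from dataclasses import dataclass
from typing import Any, Optional


@dataclass
class ToyConfig:
    """Illustrative config following the compute_hash convention above."""
    graph_affecting_flag: bool = False

    def compute_hash(self) -> str:
        # Include only factors that change the computation graph.
        factors: list[Any] = [self.graph_affecting_flag]
        return hashlib.md5(str(factors).encode(),
                           usedforsecurity=False).hexdigest()


@dataclass
class ToyTopLevelConfig:
    toy_config: Optional[ToyConfig] = None

    def compute_hash(self) -> str:
        factors: list[Any] = []
        # Optional sub-configs contribute their hash, or "None" when unset.
        factors.append(self.toy_config.compute_hash()
                       if self.toy_config else "None")
        return hashlib.md5(str(factors).encode(),
                           usedforsecurity=False).hexdigest()
```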
diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index 0ef0396996b..61346da145b 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -15,7 +15,6 @@
from vllm.core.interfaces import AllocStatus, BlockSpaceManager
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
-from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sequence import (Sequence, SequenceData, SequenceGroup,
SequenceGroupBase, SequenceGroupMetadata,
SequenceGroupMetadataDelta, SequenceStage,
@@ -165,8 +164,6 @@ def __post_init__(self):
if self.num_loras > 0:
self._sort_by_lora_ids()
- self.num_prompt_adapters: int = len(self.prompt_adapter_requests)
-
def is_empty(self) -> bool:
# NOTE: We do not consider the ignored sequence groups.
return (not self.scheduled_seq_groups and not self.blocks_to_swap_in
@@ -194,14 +191,6 @@ def lora_requests(self) -> Set[LoRARequest]:
if g.seq_group.lora_request is not None
}
- @property
- def prompt_adapter_requests(self) -> Set[PromptAdapterRequest]:
- return {
- g.seq_group.prompt_adapter_request
- for g in self.scheduled_seq_groups
- if g.seq_group.prompt_adapter_request is not None
- }
-
@dataclass
class SchedulerRunningOutputs:
@@ -1648,7 +1637,6 @@ def schedule(
multi_modal_placeholders=(
seq_group.multi_modal_placeholders
if scheduler_outputs.num_prefill_groups > 0 else None),
- prompt_adapter_request=seq_group.prompt_adapter_request,
)
else:
# When SPMD mode is enabled, we only send delta data except for
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 7b73060e349..c522fb7b6ba 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -29,10 +29,9 @@
KVTransferConfig, LoadConfig, LoadFormat, LoRAConfig,
ModelConfig, ModelDType, ModelImpl, MultiModalConfig,
ObservabilityConfig, ParallelConfig, PoolerConfig,
- PrefixCachingHashAlgo, PromptAdapterConfig,
- SchedulerConfig, SchedulerPolicy, SpeculativeConfig,
- TaskOption, TokenizerMode, VllmConfig, get_attr_docs,
- get_field)
+ PrefixCachingHashAlgo, SchedulerConfig,
+ SchedulerPolicy, SpeculativeConfig, TaskOption,
+ TokenizerMode, VllmConfig, get_attr_docs, get_field)
from vllm.logger import init_logger
from vllm.platforms import CpuArchEnum, current_platform
from vllm.plugins import load_general_plugins
@@ -356,11 +355,6 @@ class EngineArgs:
lora_extra_vocab_size: int = LoRAConfig.lora_extra_vocab_size
long_lora_scaling_factors: Optional[tuple[float, ...]] = \
LoRAConfig.long_lora_scaling_factors
- # PromptAdapter fields
- enable_prompt_adapter: bool = False
- max_prompt_adapters: int = PromptAdapterConfig.max_prompt_adapters
- max_prompt_adapter_token: int = \
- PromptAdapterConfig.max_prompt_adapter_token
device: Device = DeviceConfig.device
num_scheduler_steps: int = SchedulerConfig.num_scheduler_steps
@@ -728,23 +722,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
lora_group.add_argument("--default-mm-loras",
**lora_kwargs["default_mm_loras"])
- # PromptAdapter related configs
- prompt_adapter_kwargs = get_kwargs(PromptAdapterConfig)
- prompt_adapter_group = parser.add_argument_group(
- title="PromptAdapterConfig",
- description=PromptAdapterConfig.__doc__,
- )
- prompt_adapter_group.add_argument(
- "--enable-prompt-adapter",
- action=argparse.BooleanOptionalAction,
- help="If True, enable handling of PromptAdapters.")
- prompt_adapter_group.add_argument(
- "--max-prompt-adapters",
- **prompt_adapter_kwargs["max_prompt_adapters"])
- prompt_adapter_group.add_argument(
- "--max-prompt-adapter-token",
- **prompt_adapter_kwargs["max_prompt_adapter_token"])
-
# Device arguments
device_kwargs = get_kwargs(DeviceConfig)
device_group = parser.add_argument_group(
@@ -868,6 +845,12 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
parser.add_argument('--disable-log-stats',
action='store_true',
help='Disable logging statistics.')
+ parser.add_argument('--enable-prompt-adapter',
+ action='store_true',
+ deprecated=True,
+ help='[DEPRECATED] Prompt adapter has been '
+ 'removed. Setting this flag to True or False'
+ ' has no effect on vLLM behavior.')
return parser
@@ -1250,11 +1233,6 @@ def create_engine_config(
load_config = self.create_load_config()
- prompt_adapter_config = PromptAdapterConfig(
- max_prompt_adapters=self.max_prompt_adapters,
- max_prompt_adapter_token=self.max_prompt_adapter_token) \
- if self.enable_prompt_adapter else None
-
decoding_config = DecodingConfig(
backend=self.guided_decoding_backend,
disable_fallback=self.guided_decoding_disable_fallback,
@@ -1282,7 +1260,6 @@ def create_engine_config(
load_config=load_config,
decoding_config=decoding_config,
observability_config=observability_config,
- prompt_adapter_config=prompt_adapter_config,
compilation_config=self.compilation_config,
kv_transfer_config=self.kv_transfer_config,
kv_events_config=self.kv_events_config,
@@ -1371,12 +1348,6 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool:
recommend_to_remove=False)
return False
- # No Prompt Adapter so far.
- if self.enable_prompt_adapter:
- _raise_or_fallback(feature_name="--enable-prompt-adapter",
- recommend_to_remove=False)
- return False
-
# No text embedding inputs so far.
if self.enable_prompt_embeds:
_raise_or_fallback(feature_name="--enable-prompt-embeds",
@@ -1514,7 +1485,6 @@ def _set_default_args_v0(self, model_config: ModelConfig) -> None:
if (is_gpu and not use_sliding_window and not use_spec_decode
and not self.enable_lora
- and not self.enable_prompt_adapter
and model_config.runner_type != "pooling"):
self.enable_chunked_prefill = True
logger.warning(
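
The replacement `--enable-prompt-adapter` flag above is kept only as a deprecated no-op so existing launch scripts do not break. A minimal standalone sketch of that behavior, assuming Python 3.13+ where plain `argparse` accepts `deprecated=True` (vLLM's `FlexibleArgumentParser` is not used here):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--enable-prompt-adapter',
                    action='store_true',
                    deprecated=True,
                    help='[DEPRECATED] No effect; prompt adapter support '
                         'has been removed.')

args = parser.parse_args(['--enable-prompt-adapter'])
# argparse emits a deprecation warning on stderr; downstream code simply
# ignores the parsed value.
print(args.enable_prompt_adapter)
```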
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 3d7d28055dd..cbff032cdaa 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -29,7 +29,6 @@
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.outputs import PoolingRequestOutput, RequestOutput
from vllm.pooling_params import PoolingParams
-from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import SamplingParams
from vllm.sequence import ExecuteModelRequest
from vllm.transformers_utils.tokenizer import AnyTokenizer
@@ -435,7 +434,6 @@ async def add_request_async(
arrival_time: Optional[float] = None,
lora_request: Optional[LoRARequest] = None,
trace_headers: Optional[Mapping[str, str]] = None,
- prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0,
data_parallel_rank: Optional[int] = None,
) -> None:
@@ -467,7 +465,6 @@ async def add_request_async(
processed_inputs = await self.input_preprocessor.preprocess_async(
prompt,
lora_request=lora_request,
- prompt_adapter_request=prompt_adapter_request,
)
if isinstance(params, SamplingParams) and \
@@ -489,7 +486,6 @@ async def add_request_async(
params=params,
arrival_time=arrival_time,
lora_request=lora_request,
- prompt_adapter_request=prompt_adapter_request,
trace_headers=trace_headers,
priority=priority,
)
@@ -859,7 +855,6 @@ async def add_request(
arrival_time: Optional[float] = None,
lora_request: Optional[LoRARequest] = None,
trace_headers: Optional[Mapping[str, str]] = None,
- prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0,
data_parallel_rank: Optional[int] = None,
) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]:
@@ -886,7 +881,6 @@ async def add_request(
arrival_time=arrival_time or time.time(),
lora_request=lora_request,
trace_headers=trace_headers,
- prompt_adapter_request=prompt_adapter_request,
priority=priority,
data_parallel_rank=data_parallel_rank,
)
@@ -900,7 +894,6 @@ async def generate(
request_id: str,
lora_request: Optional[LoRARequest] = None,
trace_headers: Optional[Mapping[str, str]] = None,
- prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0,
data_parallel_rank: Optional[int] = None,
) -> AsyncGenerator[RequestOutput, None]:
@@ -918,8 +911,6 @@ async def generate(
request_id: The unique id of the request.
lora_request: LoRA request to use for generation, if any.
trace_headers: OpenTelemetry trace headers.
- prompt_adapter_request: Prompt Adapter request to use
- for generation, if any.
priority: The priority of the request.
Only applicable with priority scheduling.
data_parallel_rank: The (global) data parallel rank that must
@@ -979,7 +970,6 @@ async def generate(
sampling_params,
lora_request=lora_request,
trace_headers=trace_headers,
- prompt_adapter_request=prompt_adapter_request,
priority=priority,
data_parallel_rank=data_parallel_rank,
):
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 25fa1c3058b..df37d173ea8 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -44,7 +44,6 @@
from vllm.outputs import (PoolingRequestOutput, RequestOutput,
RequestOutputFactory)
from vllm.pooling_params import PoolingParams
-from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import RequestOutputKind, SamplingParams
from vllm.sequence import (ExecuteModelRequest, ParallelSampleSequenceGroup,
PoolingSequenceGroupOutput, Sequence, SequenceGroup,
@@ -223,7 +222,6 @@ def __init__(
self.load_config = vllm_config.load_config
self.decoding_config = vllm_config.decoding_config or DecodingConfig( # noqa
)
- self.prompt_adapter_config = vllm_config.prompt_adapter_config # noqa
self.observability_config = vllm_config.observability_config or ObservabilityConfig( # noqa
)
@@ -294,8 +292,6 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer:
# Feature flags
"enable_lora":
bool(self.lora_config),
- "enable_prompt_adapter":
- bool(self.prompt_adapter_config),
"enable_prefix_caching":
self.cache_config.enable_prefix_caching,
"enforce_eager":
@@ -542,9 +538,6 @@ def _verify_args(self) -> None:
self.lora_config.verify_with_model_config(self.model_config)
self.lora_config.verify_with_scheduler_config(
self.scheduler_config)
- if self.prompt_adapter_config:
- self.prompt_adapter_config.verify_with_model_config(
- self.model_config)
def _add_processed_request(
self,
@@ -553,7 +546,6 @@ def _add_processed_request(
params: Union[SamplingParams, PoolingParams],
arrival_time: float,
lora_request: Optional[LoRARequest],
- prompt_adapter_request: Optional[PromptAdapterRequest],
trace_headers: Optional[Mapping[str, str]] = None,
priority: int = 0,
) -> Optional[SequenceGroup]:
@@ -569,7 +561,6 @@ def _add_processed_request(
arrival_time=arrival_time,
lora_request=lora_request,
trace_headers=trace_headers,
- prompt_adapter_request=prompt_adapter_request,
priority=priority,
)
return None
@@ -583,11 +574,10 @@ def _add_processed_request(
encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs)
seq = Sequence(seq_id, decoder_inputs, block_size, eos_token_id,
- lora_request, prompt_adapter_request)
+ lora_request)
encoder_seq = (None if encoder_inputs is None else Sequence(
- seq_id, encoder_inputs, block_size, eos_token_id, lora_request,
- prompt_adapter_request))
+ seq_id, encoder_inputs, block_size, eos_token_id, lora_request))
# Create a SequenceGroup based on SamplingParams or PoolingParams
if isinstance(params, SamplingParams):
@@ -598,7 +588,6 @@ def _add_processed_request(
arrival_time=arrival_time,
lora_request=lora_request,
trace_headers=trace_headers,
- prompt_adapter_request=prompt_adapter_request,
encoder_seq=encoder_seq,
priority=priority)
elif isinstance(params, PoolingParams):
@@ -608,7 +597,6 @@ def _add_processed_request(
params,
arrival_time=arrival_time,
lora_request=lora_request,
- prompt_adapter_request=prompt_adapter_request,
encoder_seq=encoder_seq,
priority=priority)
else:
@@ -637,7 +625,6 @@ def add_request(
lora_request: Optional[LoRARequest] = None,
tokenization_kwargs: Optional[dict[str, Any]] = None,
trace_headers: Optional[Mapping[str, str]] = None,
- prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0,
) -> None:
"""Add a request to the engine's request pool.
@@ -658,7 +645,6 @@ def add_request(
the current monotonic time.
lora_request: The LoRA request to add.
trace_headers: OpenTelemetry trace headers.
- prompt_adapter_request: The prompt adapter request to add.
priority: The priority of the request.
Only applicable with priority scheduling.
@@ -719,7 +705,6 @@ def add_request(
prompt,
tokenization_kwargs=tokenization_kwargs,
lora_request=lora_request,
- prompt_adapter_request=prompt_adapter_request,
)
self._add_processed_request(
@@ -728,7 +713,6 @@ def add_request(
params=params,
arrival_time=arrival_time,
lora_request=lora_request,
- prompt_adapter_request=prompt_adapter_request,
trace_headers=trace_headers,
priority=priority,
)
@@ -741,7 +725,6 @@ def _create_sequence_group_with_sampling(
arrival_time: float,
lora_request: Optional[LoRARequest],
trace_headers: Optional[Mapping[str, str]] = None,
- prompt_adapter_request: Optional[PromptAdapterRequest] = None,
encoder_seq: Optional[Sequence] = None,
priority: int = 0,
) -> SequenceGroup:
@@ -769,17 +752,15 @@ def _create_sequence_group_with_sampling(
if self.vllm_config.speculative_config is not None:
draft_size = \
self.vllm_config.speculative_config.num_speculative_tokens + 1
- seq_group = SequenceGroup(
- request_id=request_id,
- seqs=[seq],
- arrival_time=arrival_time,
- sampling_params=sampling_params,
- lora_request=lora_request,
- trace_headers=trace_headers,
- prompt_adapter_request=prompt_adapter_request,
- encoder_seq=encoder_seq,
- priority=priority,
- draft_size=draft_size)
+ seq_group = SequenceGroup(request_id=request_id,
+ seqs=[seq],
+ arrival_time=arrival_time,
+ sampling_params=sampling_params,
+ lora_request=lora_request,
+ trace_headers=trace_headers,
+ encoder_seq=encoder_seq,
+ priority=priority,
+ draft_size=draft_size)
return seq_group
@@ -790,7 +771,6 @@ def _create_sequence_group_with_pooling(
pooling_params: PoolingParams,
arrival_time: float,
lora_request: Optional[LoRARequest],
- prompt_adapter_request: Optional[PromptAdapterRequest],
encoder_seq: Optional[Sequence] = None,
priority: int = 0,
) -> SequenceGroup:
@@ -798,15 +778,13 @@ def _create_sequence_group_with_pooling(
# Defensive copy of PoolingParams, which are used by the pooler
pooling_params = pooling_params.clone()
# Create the sequence group.
- seq_group = SequenceGroup(
- request_id=request_id,
- seqs=[seq],
- arrival_time=arrival_time,
- lora_request=lora_request,
- pooling_params=pooling_params,
- prompt_adapter_request=prompt_adapter_request,
- encoder_seq=encoder_seq,
- priority=priority)
+ seq_group = SequenceGroup(request_id=request_id,
+ seqs=[seq],
+ arrival_time=arrival_time,
+ lora_request=lora_request,
+ pooling_params=pooling_params,
+ encoder_seq=encoder_seq,
+ priority=priority)
return seq_group
def abort_request(self, request_id: Union[str, Iterable[str]]) -> None:
@@ -1842,16 +1820,6 @@ def list_loras(self) -> Set[int]:
def pin_lora(self, lora_id: int) -> bool:
return self.model_executor.pin_lora(lora_id)
- def add_prompt_adapter(
- self, prompt_adapter_request: PromptAdapterRequest) -> bool:
- return self.model_executor.add_prompt_adapter(prompt_adapter_request)
-
- def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
- return self.model_executor.remove_prompt_adapter(prompt_adapter_id)
-
- def list_prompt_adapters(self) -> List[int]:
- return self.model_executor.list_prompt_adapters()
-
def start_profile(self) -> None:
self.model_executor.start_profile()
diff --git a/vllm/engine/multiprocessing/__init__.py b/vllm/engine/multiprocessing/__init__.py
index db968cd6b5d..ff0405d2f84 100644
--- a/vllm/engine/multiprocessing/__init__.py
+++ b/vllm/engine/multiprocessing/__init__.py
@@ -10,7 +10,6 @@
from vllm.inputs import PromptType
from vllm.lora.request import LoRARequest
from vllm.outputs import RequestOutput
-from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import SamplingParams
from vllm.utils import Device
@@ -33,7 +32,6 @@ class RPCProcessRequest:
request_id: str
lora_request: Optional[LoRARequest] = None
trace_headers: Optional[Mapping[str, str]] = None
- prompt_adapter_request: Optional[PromptAdapterRequest] = None
priority: int = 0
def __init__(
@@ -43,7 +41,6 @@ def __init__(
request_id: str,
lora_request: Optional[LoRARequest] = None,
trace_headers: Optional[Mapping[str, str]] = None,
- prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0,
) -> None:
super().__init__()
@@ -53,7 +50,6 @@ def __init__(
self.request_id = request_id
self.lora_request = lora_request
self.trace_headers = trace_headers
- self.prompt_adapter_request = prompt_adapter_request
self.priority = priority
diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py
index 9e018ec7f34..67d9a3bf6ce 100644
--- a/vllm/engine/multiprocessing/client.py
+++ b/vllm/engine/multiprocessing/client.py
@@ -45,7 +45,6 @@
from vllm.lora.request import LoRARequest
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.outputs import PoolingRequestOutput, RequestOutput
-from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import SamplingParams
from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
from vllm.utils import Device
@@ -448,7 +447,6 @@ def generate(
request_id: str,
lora_request: Optional[LoRARequest] = None,
trace_headers: Optional[Mapping[str, str]] = None,
- prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0,
) -> AsyncGenerator[RequestOutput, None]:
"""Generate outputs for a request.
@@ -465,8 +463,6 @@ def generate(
request_id: The unique id of the request.
lora_request: LoRA request to use for generation, if any.
trace_headers: OpenTelemetry trace headers.
- prompt_adapter_request: Prompt Adapter request to use
- for generation, if any.
priority: Priority of the request (lower means earlier handling).
Any priority other than 0 will lead to an error if the
scheduling policy is not "priority".
@@ -474,8 +470,7 @@ def generate(
return cast(
AsyncGenerator[RequestOutput, None],
self._process_request(prompt, sampling_params, request_id,
- lora_request, trace_headers,
- prompt_adapter_request, priority))
+ lora_request, trace_headers, priority))
def encode(
self,
@@ -521,7 +516,6 @@ async def _process_request(
request_id: str,
lora_request: Optional[LoRARequest] = None,
trace_headers: Optional[Mapping[str, str]] = None,
- prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0,
) -> Union[AsyncGenerator[RequestOutput, None], AsyncGenerator[
PoolingRequestOutput, None]]:
@@ -575,7 +569,6 @@ async def _process_request(
request_id=request_id,
lora_request=lora_request,
trace_headers=trace_headers,
- prompt_adapter_request=prompt_adapter_request,
priority=priority,
))
diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py
index ef088bd3933..fe6eb0d8c2f 100644
--- a/vllm/engine/multiprocessing/engine.py
+++ b/vllm/engine/multiprocessing/engine.py
@@ -304,14 +304,12 @@ def _handle_process_request(self, request: RPCProcessRequest):
self._send_outputs(rpc_err)
try:
- self.engine.add_request(
- request_id=request_id,
- prompt=request.prompt,
- params=request.params,
- lora_request=request.lora_request,
- trace_headers=request.trace_headers,
- prompt_adapter_request=request.prompt_adapter_request,
- priority=request.priority)
+ self.engine.add_request(request_id=request_id,
+ prompt=request.prompt,
+ params=request.params,
+ lora_request=request.lora_request,
+ trace_headers=request.trace_headers,
+ priority=request.priority)
if self.log_requests:
logger.info("Added request %s.", request.request_id)
diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py
index 8688fcc82cd..85a261ac551 100644
--- a/vllm/engine/protocol.py
+++ b/vllm/engine/protocol.py
@@ -16,7 +16,6 @@
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.outputs import CompletionOutput, PoolingRequestOutput, RequestOutput
from vllm.pooling_params import PoolingParams
-from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import BeamSearchParams, SamplingParams
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.utils import Device, collect_from_async_generator, random_uuid
@@ -55,7 +54,6 @@ def generate(
request_id: str,
lora_request: Optional[LoRARequest] = None,
trace_headers: Optional[Mapping[str, str]] = None,
- prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0,
) -> AsyncGenerator[RequestOutput, None]:
"""Generate outputs for a request."""
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index e7398ecc23c..da4e109cff5 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -45,7 +45,6 @@
PoolingRequestOutput, RequestOutput,
ScoringRequestOutput)
from vllm.pooling_params import PoolingParams
-from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams,
RequestOutputKind, SamplingParams)
from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
@@ -314,7 +313,6 @@ def generate(
*,
use_tqdm: Union[bool, Callable[..., tqdm]] = True,
lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
- prompt_adapter_request: Optional[PromptAdapterRequest] = None,
guided_options_request: Optional[Union[LLMGuidedOptions,
GuidedDecodingRequest]] = None,
) -> list[RequestOutput]:
@@ -330,7 +328,6 @@ def generate(
prompt_token_ids: Optional[list[int]] = None,
use_tqdm: Union[bool, Callable[..., tqdm]] = True,
lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
- prompt_adapter_request: Optional[PromptAdapterRequest] = None,
guided_options_request: Optional[Union[LLMGuidedOptions,
GuidedDecodingRequest]] = None,
) -> list[RequestOutput]:
@@ -346,7 +343,6 @@ def generate(
prompt_token_ids: Optional[list[list[int]]] = None,
use_tqdm: Union[bool, Callable[..., tqdm]] = True,
lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
- prompt_adapter_request: Optional[PromptAdapterRequest] = None,
guided_options_request: Optional[Union[LLMGuidedOptions,
GuidedDecodingRequest]] = None,
) -> list[RequestOutput]:
@@ -363,7 +359,6 @@ def generate(
prompt_token_ids: list[int],
use_tqdm: Union[bool, Callable[..., tqdm]] = True,
lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
- prompt_adapter_request: Optional[PromptAdapterRequest] = None,
guided_options_request: Optional[Union[LLMGuidedOptions,
GuidedDecodingRequest]] = None,
) -> list[RequestOutput]:
@@ -380,7 +375,6 @@ def generate(
prompt_token_ids: list[list[int]],
use_tqdm: Union[bool, Callable[..., tqdm]] = True,
lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
- prompt_adapter_request: Optional[PromptAdapterRequest] = None,
guided_options_request: Optional[Union[LLMGuidedOptions,
GuidedDecodingRequest]] = None,
) -> list[RequestOutput]:
@@ -395,7 +389,6 @@ def generate(
prompt_token_ids: Union[list[int], list[list[int]]],
use_tqdm: Union[bool, Callable[..., tqdm]] = True,
lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
- prompt_adapter_request: Optional[PromptAdapterRequest] = None,
guided_options_request: Optional[Union[LLMGuidedOptions,
GuidedDecodingRequest]] = None,
) -> list[RequestOutput]:
@@ -415,7 +408,6 @@ def generate(
prompt_token_ids: Optional[Union[list[int], list[list[int]]]] = None,
use_tqdm: Union[bool, Callable[..., tqdm]] = True,
lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
- prompt_adapter_request: Optional[PromptAdapterRequest] = None,
guided_options_request: Optional[Union[LLMGuidedOptions,
GuidedDecodingRequest]] = None,
priority: Optional[list[int]] = None,
@@ -440,8 +432,6 @@ def generate(
it is used to create the progress bar.
If `False`, no progress bar is created.
lora_request: LoRA request to use for generation, if any.
- prompt_adapter_request: Prompt Adapter request to use for
- generation, if any.
priority: The priority of the requests, if any.
Only applicable when priority scheduling policy is enabled.
@@ -507,7 +497,6 @@ def generate(
params=sampling_params,
use_tqdm=use_tqdm,
lora_request=lora_request,
- prompt_adapter_request=prompt_adapter_request,
guided_options=guided_options_request,
tokenization_kwargs=tokenization_kwargs,
priority=priority,
@@ -963,7 +952,6 @@ def encode(
truncate_prompt_tokens: Optional[int] = None,
use_tqdm: Union[bool, Callable[..., tqdm]] = True,
lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
- prompt_adapter_request: Optional[PromptAdapterRequest] = None,
) -> list[PoolingRequestOutput]:
...
@@ -978,7 +966,6 @@ def encode(
truncate_prompt_tokens: Optional[int] = None,
use_tqdm: Union[bool, Callable[..., tqdm]] = True,
lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
- prompt_adapter_request: Optional[PromptAdapterRequest] = None,
) -> list[PoolingRequestOutput]:
...
@@ -993,7 +980,6 @@ def encode(
truncate_prompt_tokens: Optional[int] = None,
use_tqdm: Union[bool, Callable[..., tqdm]] = True,
lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
- prompt_adapter_request: Optional[PromptAdapterRequest] = None,
) -> list[PoolingRequestOutput]:
...
@@ -1009,7 +995,6 @@ def encode(
truncate_prompt_tokens: Optional[int] = None,
use_tqdm: Union[bool, Callable[..., tqdm]] = True,
lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
- prompt_adapter_request: Optional[PromptAdapterRequest] = None,
) -> list[PoolingRequestOutput]:
...
@@ -1025,7 +1010,6 @@ def encode(
truncate_prompt_tokens: Optional[int] = None,
use_tqdm: Union[bool, Callable[..., tqdm]] = True,
lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
- prompt_adapter_request: Optional[PromptAdapterRequest] = None,
) -> list[PoolingRequestOutput]:
...
@@ -1039,7 +1023,6 @@ def encode(
truncate_prompt_tokens: Optional[int] = None,
use_tqdm: Union[bool, Callable[..., tqdm]] = True,
lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
- prompt_adapter_request: Optional[PromptAdapterRequest] = None,
) -> list[PoolingRequestOutput]:
...
@@ -1058,7 +1041,6 @@ def encode(
truncate_prompt_tokens: Optional[int] = None,
use_tqdm: Union[bool, Callable[..., tqdm]] = True,
lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
- prompt_adapter_request: Optional[PromptAdapterRequest] = None,
) -> list[PoolingRequestOutput]:
"""Apply pooling to the hidden states corresponding to the input
prompts.
@@ -1078,8 +1060,6 @@ def encode(
it is used to create the progress bar.
If `False`, no progress bar is created.
lora_request: LoRA request to use for generation, if any.
- prompt_adapter_request: Prompt Adapter request to use for
- generation, if any.
Returns:
A list of `PoolingRequestOutput` objects containing the
@@ -1132,7 +1112,6 @@ def encode(
use_tqdm=use_tqdm,
lora_request=lora_request,
tokenization_kwargs=tokenization_kwargs,
- prompt_adapter_request=prompt_adapter_request,
)
outputs = self._run_engine(use_tqdm=use_tqdm)
@@ -1149,7 +1128,6 @@ def embed(
pooling_params: Optional[Union[PoolingParams,
Sequence[PoolingParams]]] = None,
lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
- prompt_adapter_request: Optional[PromptAdapterRequest] = None,
) -> list[EmbeddingRequestOutput]:
"""
Generate an embedding vector for each prompt.
@@ -1169,8 +1147,6 @@ def embed(
it is used to create the progress bar.
If `False`, no progress bar is created.
lora_request: LoRA request to use for generation, if any.
- prompt_adapter_request: Prompt Adapter request to use for
- generation, if any.
Returns:
A list of `EmbeddingRequestOutput` objects containing the
@@ -1185,8 +1161,7 @@ def embed(
truncate_prompt_tokens=truncate_prompt_tokens,
use_tqdm=use_tqdm,
pooling_params=pooling_params,
- lora_request=lora_request,
- prompt_adapter_request=prompt_adapter_request)
+ lora_request=lora_request)
return [EmbeddingRequestOutput.from_base(item) for item in items]
@@ -1197,7 +1172,6 @@ def classify(
*,
use_tqdm: Union[bool, Callable[..., tqdm]] = True,
lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
- prompt_adapter_request: Optional[PromptAdapterRequest] = None,
) -> list[ClassificationRequestOutput]:
"""
Generate class logits for each prompt.
@@ -1215,8 +1189,6 @@ def classify(
it is used to create the progress bar.
If `False`, no progress bar is created.
lora_request: LoRA request to use for generation, if any.
- prompt_adapter_request: Prompt Adapter request to use for
- generation, if any.
Returns:
A list of `ClassificationRequestOutput` objects containing the
@@ -1230,8 +1202,7 @@ def classify(
items = self.encode(prompts,
use_tqdm=use_tqdm,
- lora_request=lora_request,
- prompt_adapter_request=prompt_adapter_request)
+ lora_request=lora_request)
return [ClassificationRequestOutput.from_base(item) for item in items]
@@ -1243,15 +1214,13 @@ def _embedding_score(
truncate_prompt_tokens: Optional[int] = None,
use_tqdm: Union[bool, Callable[..., tqdm]] = True,
lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
- prompt_adapter_request: Optional[PromptAdapterRequest] = None,
) -> list[ScoringRequestOutput]:
encoded_output: list[PoolingRequestOutput] = self.encode(
text_1 + text_2,
truncate_prompt_tokens=truncate_prompt_tokens,
use_tqdm=use_tqdm,
- lora_request=lora_request,
- prompt_adapter_request=prompt_adapter_request)
+ lora_request=lora_request)
encoded_output_1: list[PoolingRequestOutput] = encoded_output[
0:len(text_1)]
@@ -1277,7 +1246,6 @@ def _cross_encoding_score(
truncate_prompt_tokens: Optional[int] = None,
use_tqdm: Union[bool, Callable[..., tqdm]] = True,
lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
- prompt_adapter_request: Optional[PromptAdapterRequest] = None,
) -> list[ScoringRequestOutput]:
if isinstance(tokenizer, MistralTokenizer):
@@ -1335,7 +1303,6 @@ def _cross_encoding_score(
params=pooling_params,
use_tqdm=use_tqdm,
lora_request=lora_request,
- prompt_adapter_request=prompt_adapter_request,
)
outputs = self._run_engine(use_tqdm=use_tqdm)
@@ -1355,7 +1322,6 @@ def score(
truncate_prompt_tokens: Optional[int] = None,
use_tqdm: Union[bool, Callable[..., tqdm]] = True,
lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
- prompt_adapter_request: Optional[PromptAdapterRequest] = None,
) -> list[ScoringRequestOutput]:
"""Generate similarity scores for all pairs `` or
``.
@@ -1386,8 +1352,6 @@ def score(
it is used to create the progress bar.
If `False`, no progress bar is created.
lora_request: LoRA request to use for generation, if any.
- prompt_adapter_request: Prompt Adapter request to use for
- generation, if any.
Returns:
A list of `ScoringRequestOutput` objects containing the
@@ -1478,8 +1442,7 @@ def ensure_str(prompt: SingletonPrompt):
data_2, # type: ignore[arg-type]
truncate_prompt_tokens,
use_tqdm,
- lora_request,
- prompt_adapter_request)
+ lora_request)
else:
return self._embedding_score(
tokenizer,
@@ -1487,8 +1450,7 @@ def ensure_str(prompt: SingletonPrompt):
data_2, # type: ignore[arg-type]
truncate_prompt_tokens,
use_tqdm,
- lora_request,
- prompt_adapter_request)
+ lora_request)
def start_profile(self) -> None:
self.llm_engine.start_profile()
@@ -1599,7 +1561,6 @@ def _validate_and_add_requests(
*,
use_tqdm: Union[bool, Callable[..., tqdm]] = True,
lora_request: Optional[Union[Sequence[LoRARequest], LoRARequest]],
- prompt_adapter_request: Optional[PromptAdapterRequest],
tokenization_kwargs: Optional[dict[str, Any]] = None,
guided_options: Optional[GuidedDecodingRequest] = None,
priority: Optional[list[int]] = None,
@@ -1645,7 +1606,6 @@ def _validate_and_add_requests(
tokenization_kwargs=tokenization_kwargs,
lora_request=lora_request[i] if isinstance(
lora_request, Sequence) else lora_request,
- prompt_adapter_request=prompt_adapter_request,
priority=priority[i] if priority else 0,
)
@@ -1655,7 +1615,6 @@ def _add_request(
params: Union[SamplingParams, PoolingParams],
tokenization_kwargs: Optional[dict[str, Any]] = None,
lora_request: Optional[LoRARequest] = None,
- prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0,
) -> None:
request_id = str(next(self.request_counter))
@@ -1665,7 +1624,6 @@ def _add_request(
params,
lora_request=lora_request,
tokenization_kwargs=tokenization_kwargs,
- prompt_adapter_request=prompt_adapter_request,
priority=priority,
)
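
After this change the offline entry points still accept per-request LoRA, but no prompt adapter argument. A usage sketch of the trimmed `LLM.generate` call (model and adapter paths are placeholders, not part of the diff):

```python
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_lora=True)

outputs = llm.generate(
    ["Hello, my name is"],
    SamplingParams(temperature=0.0, max_tokens=5),
    # LoRA remains supported per request; prompt_adapter_request is gone.
    lora_request=LoRARequest("sql_test", 1, "/path/to/lora"),
)
for output in outputs:
    print(output.outputs[0].text)
```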
diff --git a/vllm/entrypoints/logger.py b/vllm/entrypoints/logger.py
index f3aee188dae..06ff3b417f8 100644
--- a/vllm/entrypoints/logger.py
+++ b/vllm/entrypoints/logger.py
@@ -8,7 +8,6 @@
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.pooling_params import PoolingParams
-from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import BeamSearchParams, SamplingParams
logger = init_logger(__name__)
@@ -30,7 +29,6 @@ def log_inputs(
params: Optional[Union[SamplingParams, PoolingParams,
BeamSearchParams]],
lora_request: Optional[LoRARequest],
- prompt_adapter_request: Optional[PromptAdapterRequest],
) -> None:
max_log_len = self.max_log_len
if max_log_len is not None:
@@ -44,7 +42,6 @@ def log_inputs(
"Received request %s: prompt: %r, "
"params: %s, prompt_token_ids: %s, "
"prompt_embeds shape: %s, "
- "lora_request: %s, prompt_adapter_request: %s.", request_id,
- prompt, params, prompt_token_ids,
+ "lora_request: %s.", request_id, prompt, params, prompt_token_ids,
prompt_embeds.shape if prompt_embeds is not None else None,
- lora_request, prompt_adapter_request)
+ lora_request)
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 19d0110ff37..df1607ad4cc 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -1502,7 +1502,6 @@ async def init_app_state(
model_config=model_config,
base_model_paths=base_model_paths,
lora_modules=lora_modules,
- prompt_adapters=args.prompt_adapters,
)
await state.openai_serving_models.init_static_loras()
state.openai_serving_responses = OpenAIServingResponses(
diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py
index bccce73b79f..1cba3d89d91 100644
--- a/vllm/entrypoints/openai/cli_args.py
+++ b/vllm/entrypoints/openai/cli_args.py
@@ -20,8 +20,7 @@
from vllm.engine.arg_utils import AsyncEngineArgs, optional_type
from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption,
validate_chat_template)
-from vllm.entrypoints.openai.serving_models import (LoRAModulePath,
- PromptAdapterPath)
+from vllm.entrypoints.openai.serving_models import LoRAModulePath
from vllm.entrypoints.openai.tool_parsers import ToolParserManager
from vllm.logger import init_logger
from vllm.utils import FlexibleArgumentParser
@@ -65,27 +64,6 @@ def __call__(
setattr(namespace, self.dest, lora_list)
-class PromptAdapterParserAction(argparse.Action):
-
- def __call__(
- self,
- parser: argparse.ArgumentParser,
- namespace: argparse.Namespace,
- values: Optional[Union[str, Sequence[str]]],
- option_string: Optional[str] = None,
- ):
- if values is None:
- values = []
- if isinstance(values, str):
- raise TypeError("Expected values to be a list")
-
- adapter_list: list[PromptAdapterPath] = []
- for item in values:
- name, path = item.split('=')
- adapter_list.append(PromptAdapterPath(name, path))
- setattr(namespace, self.dest, adapter_list)
-
-
@config
@dataclass
class FrontendArgs:
@@ -115,9 +93,6 @@ class FrontendArgs:
or JSON list format. Example (old format): `'name=path'` Example (new
format): `{\"name\": \"name\", \"path\": \"lora_path\",
\"base_model_name\": \"id\"}`"""
- prompt_adapters: Optional[list[PromptAdapterPath]] = None
- """Prompt adapter configurations in the format name=path. Multiple adapters
- can be specified."""
chat_template: Optional[str] = None
"""The file path to the chat template, or the template in single-line form
for the specified model."""
@@ -204,12 +179,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
frontend_kwargs["lora_modules"]["type"] = optional_type(str)
frontend_kwargs["lora_modules"]["action"] = LoRAParserAction
- # Special case: Prompt adapters need custom parser action and
- # optional_type(str)
- frontend_kwargs["prompt_adapters"]["type"] = optional_type(str)
- frontend_kwargs["prompt_adapters"][
- "action"] = PromptAdapterParserAction
-
# Special case: Middleware needs append action
frontend_kwargs["middleware"]["action"] = "append"
@@ -281,9 +250,6 @@ def validate_parsed_serve_args(args: argparse.Namespace):
if args.enable_auto_tool_choice and not args.tool_call_parser:
raise TypeError("Error: --enable-auto-tool-choice requires "
"--tool-call-parser")
- if args.enable_prompt_embeds and args.enable_prompt_adapter:
- raise ValueError(
- "Cannot use prompt embeds and prompt adapter at the same time.")
def log_non_default_args(args: argparse.Namespace):
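With PromptAdapterParserAction gone, --lora-modules is the frontend option left using this style of custom parser action for the old name=path form. A minimal sketch of that parsing under the same split logic the removed action used (the wrapper function is illustrative; the JSON form documented on FrontendArgs.lora_modules is handled elsewhere and not shown):

from vllm.entrypoints.openai.serving_models import LoRAModulePath


def parse_lora_modules(values: list[str]) -> list[LoRAModulePath]:
    # Old-style "name=path" entries only.
    modules: list[LoRAModulePath] = []
    for item in values:
        name, path = item.split('=')
        modules.append(LoRAModulePath(name, path))
    return modules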
diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py
index 3dc5826909a..ef5bf6f9a81 100644
--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -337,7 +337,6 @@ async def main(args):
model_config=model_config,
base_model_paths=base_model_paths,
lora_modules=None,
- prompt_adapters=None,
)
openai_serving_chat = OpenAIServingChat(
engine,
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index b902166a25b..53215a2fa65 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -147,11 +147,8 @@ async def create_chat_completion(
raise self.engine_client.dead_error
try:
- (
- lora_request,
- prompt_adapter_request,
- ) = self._maybe_get_adapters(request,
- supports_default_mm_loras=True)
+ lora_request = self._maybe_get_adapters(
+ request, supports_default_mm_loras=True)
model_name = self._get_model_name(request.model, lora_request)
@@ -239,8 +236,7 @@ async def create_chat_completion(
self._log_inputs(request_id,
request_prompts[i],
params=sampling_params,
- lora_request=lora_request,
- prompt_adapter_request=prompt_adapter_request)
+ lora_request=lora_request)
trace_headers = (None if raw_request is None else await
self._get_trace_headers(raw_request.headers))
@@ -259,7 +255,6 @@ async def create_chat_completion(
request_id,
lora_request=lora_request,
trace_headers=trace_headers,
- prompt_adapter_request=prompt_adapter_request,
priority=request.priority,
)
diff --git a/vllm/entrypoints/openai/serving_classification.py b/vllm/entrypoints/openai/serving_classification.py
index 3ac4f01ea60..68556864049 100644
--- a/vllm/entrypoints/openai/serving_classification.py
+++ b/vllm/entrypoints/openai/serving_classification.py
@@ -46,19 +46,11 @@ async def _preprocess(
return None
try:
- (
- ctx.lora_request,
- ctx.prompt_adapter_request,
- ) = self._maybe_get_adapters(ctx.request)
+ ctx.lora_request = self._maybe_get_adapters(ctx.request)
ctx.tokenizer = await self.engine_client.get_tokenizer(
ctx.lora_request)
- if ctx.prompt_adapter_request is not None:
- raise NotImplementedError(
- "Prompt adapter is not supported for classification models"
- )
-
(
ctx.request_prompts,
ctx.engine_prompts,
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index eb9a35a7a37..a0a5239637e 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -116,10 +116,7 @@ async def create_completion(
raw_request.state.request_metadata = request_metadata
try:
- (
- lora_request,
- prompt_adapter_request,
- ) = self._maybe_get_adapters(request)
+ lora_request = self._maybe_get_adapters(request)
tokenizer = await self.engine_client.get_tokenizer(lora_request)
@@ -187,8 +184,7 @@ async def create_completion(
self._log_inputs(request_id_item,
request_prompts[i],
params=sampling_params,
- lora_request=lora_request,
- prompt_adapter_request=prompt_adapter_request)
+ lora_request=lora_request)
trace_headers = (None if raw_request is None else await
self._get_trace_headers(raw_request.headers))
@@ -211,7 +207,6 @@ async def create_completion(
sampling_params,
request_id_item,
lora_request=lora_request,
- prompt_adapter_request=prompt_adapter_request,
trace_headers=trace_headers,
priority=request.priority,
)
diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py
index e87decfe636..3a216a0fe06 100644
--- a/vllm/entrypoints/openai/serving_embedding.py
+++ b/vllm/entrypoints/openai/serving_embedding.py
@@ -51,18 +51,11 @@ async def _preprocess(
) -> Optional[ErrorResponse]:
ctx = cast(EmbeddingServeContext, ctx)
try:
- (
- ctx.lora_request,
- ctx.prompt_adapter_request,
- ) = self._maybe_get_adapters(ctx.request)
+ ctx.lora_request = self._maybe_get_adapters(ctx.request)
tokenizer = await self.engine_client.get_tokenizer(ctx.lora_request
)
- if ctx.prompt_adapter_request is not None:
- raise NotImplementedError("Prompt adapter is not supported "
- "for embedding models")
-
if isinstance(ctx.request, EmbeddingChatRequest):
(
_,
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index 462317a0878..ba22a955902 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -68,7 +68,6 @@
MultiModalDataDict)
from vllm.outputs import PoolingRequestOutput, RequestOutput
from vllm.pooling_params import PoolingParams
-from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import BeamSearchParams, SamplingParams
from vllm.sequence import Logprob, PromptLogprobs
from vllm.tracing import (contains_trace_headers, extract_trace_headers,
@@ -161,7 +160,6 @@ class ServeContext(RequestProcessingMixin, ResponseGenerationMixin, BaseModel,
request_id: str
created_time: int = Field(default_factory=lambda: int(time.time()))
lora_request: Optional[LoRARequest] = None
- prompt_adapter_request: Optional[PromptAdapterRequest] = None
# Shared across most requests
tokenizer: Optional[AnyTokenizer] = None
@@ -335,12 +333,10 @@ async def _prepare_generators(
return self.create_error_response(
"Request prompts not available")
- self._log_inputs(
- request_id_item,
- ctx.request_prompts[i],
- params=pooling_params,
- lora_request=ctx.lora_request,
- prompt_adapter_request=ctx.prompt_adapter_request)
+ self._log_inputs(request_id_item,
+ ctx.request_prompts[i],
+ params=pooling_params,
+ lora_request=ctx.lora_request)
# Mypy has an existing bug related to inferring the variance of
# TypedDicts with `builtins.enumerate`:
@@ -442,11 +438,6 @@ async def _check_model(
if isinstance(load_result, ErrorResponse) and \
load_result.code == HTTPStatus.BAD_REQUEST.value:
error_response = load_result
- if request.model in [
- prompt_adapter.prompt_adapter_name
- for prompt_adapter in self.models.prompt_adapter_requests
- ]:
- return None
return error_response or self.create_error_response(
message=f"The model `{request.model}` does not exist.",
@@ -481,25 +472,21 @@ def _maybe_get_adapters(
self,
request: AnyRequest,
supports_default_mm_loras: bool = False,
- ) -> Union[tuple[None, None], tuple[LoRARequest, None], tuple[
- None, PromptAdapterRequest]]:
+ ) -> Optional[LoRARequest]:
if request.model in self.models.lora_requests:
- return self.models.lora_requests[request.model], None
+ return self.models.lora_requests[request.model]
# Currently only support default modality specific loras
# if we have exactly one lora matched on the request.
if supports_default_mm_loras:
default_mm_lora = self._get_active_default_mm_loras(request)
if default_mm_lora is not None:
- return default_mm_lora, None
+ return default_mm_lora
if self._is_model_supported(request.model):
- return None, None
+ return None
- for prompt_adapter in self.models.prompt_adapter_requests:
- if request.model == prompt_adapter.prompt_adapter_name:
- return None, prompt_adapter
# if _check_model has been called earlier, this will be unreachable
raise ValueError(f"The model `{request.model}` does not exist.")
@@ -979,7 +966,6 @@ def _log_inputs(
params: Optional[Union[SamplingParams, PoolingParams,
BeamSearchParams]],
lora_request: Optional[LoRARequest],
- prompt_adapter_request: Optional[PromptAdapterRequest],
) -> None:
if self.request_logger is None:
return
@@ -1001,7 +987,6 @@ def _log_inputs(
prompt_embeds,
params=params,
lora_request=lora_request,
- prompt_adapter_request=prompt_adapter_request,
)
async def _get_trace_headers(
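The collapsed return type is the key interface change in this file: every caller now receives a single Optional[LoRARequest] instead of unpacking a two-tuple. A standalone mirror of the new contract, with hypothetical helper and argument names (the default multimodal-LoRA lookup is omitted for brevity):

from typing import Optional

from vllm.lora.request import LoRARequest


def resolve_adapter(model_name: str,
                    lora_requests: dict[str, LoRARequest],
                    served_model_names: set[str]) -> Optional[LoRARequest]:
    # A registered LoRA wins, a served base model resolves to None,
    # and anything else is rejected, matching _maybe_get_adapters above.
    if model_name in lora_requests:
        return lora_requests[model_name]
    if model_name in served_model_names:
        return None
    raise ValueError(f"The model `{model_name}` does not exist.")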
diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py
index bc4f523c82e..27614fcb411 100644
--- a/vllm/entrypoints/openai/serving_models.py
+++ b/vllm/entrypoints/openai/serving_models.py
@@ -1,8 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import json
-import pathlib
from asyncio import Lock
from collections import defaultdict
from dataclasses import dataclass
@@ -19,7 +17,6 @@
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
-from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.utils import AtomicCounter
logger = init_logger(__name__)
@@ -31,12 +28,6 @@ class BaseModelPath:
model_path: str
-@dataclass
-class PromptAdapterPath:
- name: str
- local_path: str
-
-
@dataclass
class LoRAModulePath:
name: str
@@ -60,7 +51,6 @@ def __init__(
base_model_paths: list[BaseModelPath],
*,
lora_modules: Optional[list[LoRAModulePath]] = None,
- prompt_adapters: Optional[list[PromptAdapterPath]] = None,
):
super().__init__()
@@ -81,20 +71,6 @@ def __init__(
LoRAResolverRegistry.get_resolver(lora_resolver_name))
self.lora_resolver_lock: dict[str, Lock] = defaultdict(Lock)
- self.prompt_adapter_requests = []
- if prompt_adapters is not None:
- for i, prompt_adapter in enumerate(prompt_adapters, start=1):
- with pathlib.Path(prompt_adapter.local_path,
- "adapter_config.json").open() as f:
- adapter_config = json.load(f)
- num_virtual_tokens = adapter_config["num_virtual_tokens"]
- self.prompt_adapter_requests.append(
- PromptAdapterRequest(
- prompt_adapter_name=prompt_adapter.name,
- prompt_adapter_id=i,
- prompt_adapter_local_path=prompt_adapter.local_path,
- prompt_adapter_num_virtual_tokens=num_virtual_tokens))
-
async def init_static_loras(self):
"""Loads all static LoRA modules.
Raises if any fail to load"""
@@ -141,14 +117,7 @@ async def show_available_models(self) -> ModelList:
permission=[ModelPermission()])
for lora in self.lora_requests.values()
]
- prompt_adapter_cards = [
- ModelCard(id=prompt_adapter.prompt_adapter_name,
- root=self.base_model_paths[0].name,
- permission=[ModelPermission()])
- for prompt_adapter in self.prompt_adapter_requests
- ]
model_cards.extend(lora_cards)
- model_cards.extend(prompt_adapter_cards)
return ModelList(data=model_cards)
async def load_lora_adapter(
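After this change the models registry is wired up with LoRA modules only. A call-site sketch mirroring the init_app_state and run_batch hunks earlier in this patch; any constructor argument not visible in those hunks (notably the leading engine_client) is assumed:

from vllm.entrypoints.openai.serving_models import (BaseModelPath,
                                                    LoRAModulePath,
                                                    OpenAIServingModels)


async def build_serving_models(engine_client, model_config,
                               base_model_paths: list[BaseModelPath],
                               lora_modules: list[LoRAModulePath]):
    serving_models = OpenAIServingModels(
        engine_client,
        model_config=model_config,
        base_model_paths=base_model_paths,
        lora_modules=lora_modules,
    )
    # Static LoRAs still load eagerly; there is no prompt-adapter card pass.
    await serving_models.init_static_loras()
    return serving_models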
diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py
index c2ed50d04d1..6c1e43e338d 100644
--- a/vllm/entrypoints/openai/serving_pooling.py
+++ b/vllm/entrypoints/openai/serving_pooling.py
@@ -94,17 +94,10 @@ async def create_pooling(
try:
truncate_prompt_tokens = _validate_truncation_size(
self.max_model_len, truncate_prompt_tokens)
- (
- lora_request,
- prompt_adapter_request,
- ) = self._maybe_get_adapters(request)
+ lora_request = self._maybe_get_adapters(request)
tokenizer = await self.engine_client.get_tokenizer(lora_request)
- if prompt_adapter_request is not None:
- raise NotImplementedError("Prompt adapter is not supported "
- "for pooling models")
-
if isinstance(request, PoolingChatRequest):
(
_,
@@ -148,8 +141,7 @@ async def create_pooling(
self._log_inputs(request_id_item,
request_prompts[i],
params=pooling_params,
- lora_request=lora_request,
- prompt_adapter_request=prompt_adapter_request)
+ lora_request=lora_request)
trace_headers = (None if raw_request is None else await
self._get_trace_headers(raw_request.headers))
diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py
index f7bde6e243b..f2af0579911 100644
--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -133,10 +133,7 @@ async def create_responses(
messages = self._construct_input_messages(request, prev_response)
try:
- (
- lora_request,
- prompt_adapter_request,
- ) = self._maybe_get_adapters(request)
+ lora_request = self._maybe_get_adapters(request)
model_name = self._get_model_name(request.model, lora_request)
tokenizer = await self.engine_client.get_tokenizer(lora_request)
@@ -169,8 +166,7 @@ async def create_responses(
self._log_inputs(request.request_id,
request_prompts[i],
params=sampling_params,
- lora_request=lora_request,
- prompt_adapter_request=prompt_adapter_request)
+ lora_request=lora_request)
trace_headers = (None if raw_request is None else await
self._get_trace_headers(raw_request.headers))
@@ -181,7 +177,6 @@ async def create_responses(
request.request_id,
lora_request=lora_request,
trace_headers=trace_headers,
- prompt_adapter_request=prompt_adapter_request,
priority=request.priority,
)
generators.append(generator)
diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py
index 8d47a417f9c..8191e67d681 100644
--- a/vllm/entrypoints/openai/serving_score.py
+++ b/vllm/entrypoints/openai/serving_score.py
@@ -27,7 +27,6 @@
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.outputs import PoolingRequestOutput, ScoringRequestOutput
-from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
from vllm.utils import make_async, merge_async_iterators
@@ -55,11 +54,9 @@ async def _embedding_score(
texts_1: list[str],
texts_2: list[str],
request: Union[RerankRequest, ScoreRequest],
- request_id=str,
+ request_id: str,
tokenization_kwargs: Optional[dict[str, Any]] = None,
lora_request: Optional[Union[LoRARequest, None]] = None,
- prompt_adapter_request: Optional[Union[PromptAdapterRequest,
- None]] = None,
trace_headers: Optional[Mapping[str, str]] = None,
) -> list[PoolingRequestOutput]:
@@ -96,8 +93,7 @@ async def _embedding_score(
self._log_inputs(request_id_item,
input_texts[i],
params=pooling_params,
- lora_request=lora_request,
- prompt_adapter_request=prompt_adapter_request)
+ lora_request=lora_request)
generators.append(
self.engine_client.encode(
@@ -169,11 +165,9 @@ async def _cross_encoding_score(
data_1: Union[list[str], list[ScoreContentPartParam]],
data_2: Union[list[str], list[ScoreContentPartParam]],
request: Union[RerankRequest, ScoreRequest],
- request_id=str,
+ request_id: str,
tokenization_kwargs: Optional[dict[str, Any]] = None,
lora_request: Optional[Union[LoRARequest, None]] = None,
- prompt_adapter_request: Optional[Union[PromptAdapterRequest,
- None]] = None,
trace_headers: Optional[Mapping[str, str]] = None,
) -> list[PoolingRequestOutput]:
@@ -253,8 +247,7 @@ async def _cross_encoding_score(
self._log_inputs(request_id_item,
request_prompts[i],
params=pooling_params,
- lora_request=lora_request,
- prompt_adapter_request=prompt_adapter_request)
+ lora_request=lora_request)
generator = self.engine_client.encode(
engine_prompt,
@@ -288,14 +281,7 @@ async def _run_scoring(
truncate_prompt_tokens: Optional[int] = None,
) -> list[PoolingRequestOutput]:
- (
- lora_request,
- prompt_adapter_request,
- ) = self._maybe_get_adapters(request)
-
- if prompt_adapter_request is not None:
- raise NotImplementedError("Prompt adapter is not supported "
- "for scoring models")
+ lora_request = self._maybe_get_adapters(request)
tokenizer = await self.engine_client.get_tokenizer(lora_request)
@@ -333,7 +319,6 @@ async def _run_scoring(
request_id=request_id,
tokenization_kwargs=tokenization_kwargs,
lora_request=lora_request,
- prompt_adapter_request=prompt_adapter_request,
trace_headers=trace_headers)
else:
@@ -345,7 +330,6 @@ async def _run_scoring(
request_id=request_id,
tokenization_kwargs=tokenization_kwargs,
lora_request=lora_request,
- prompt_adapter_request=prompt_adapter_request,
trace_headers=trace_headers)
async def create_score(
diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py
index 3db0a71fadd..8faa92b37f2 100644
--- a/vllm/entrypoints/openai/serving_tokenization.py
+++ b/vllm/entrypoints/openai/serving_tokenization.py
@@ -58,10 +58,7 @@ async def create_tokenize(
request_id = f"tokn-{self._base_request_id(raw_request)}"
try:
- (
- lora_request,
- prompt_adapter_request,
- ) = self._maybe_get_adapters(request)
+ lora_request = self._maybe_get_adapters(request)
tokenizer = await self.engine_client.get_tokenizer(lora_request)
@@ -102,11 +99,8 @@ async def create_tokenize(
self._log_inputs(request_id,
request_prompts[i],
params=None,
- lora_request=lora_request,
- prompt_adapter_request=prompt_adapter_request)
+ lora_request=lora_request)
- # Silently ignore prompt adapter since it does not affect
- # tokenization (Unlike in Embeddings API where an error is raised)
if isinstance(engine_prompt,
dict) and "prompt_token_ids" in engine_prompt:
input_ids.extend(engine_prompt["prompt_token_ids"])
@@ -131,21 +125,14 @@ async def create_detokenize(
request_id = f"tokn-{self._base_request_id(raw_request)}"
- (
- lora_request,
- prompt_adapter_request,
- ) = self._maybe_get_adapters(request)
+ lora_request = self._maybe_get_adapters(request)
tokenizer = await self.engine_client.get_tokenizer(lora_request)
self._log_inputs(request_id,
request.tokens,
params=None,
- lora_request=lora_request,
- prompt_adapter_request=prompt_adapter_request)
-
- # Silently ignore prompt adapter since it does not affect tokenization
- # (Unlike in Embeddings API where an error is raised)
+ lora_request=lora_request)
prompt_input = await self._tokenize_prompt_input_async(
request,
diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py
index e7589a3804c..b2aa0968a41 100644
--- a/vllm/entrypoints/openai/speech_to_text.py
+++ b/vllm/entrypoints/openai/speech_to_text.py
@@ -150,19 +150,12 @@ async def _create_speech_to_text(
raw_request.state.request_metadata = request_metadata
try:
- (
- lora_request,
- prompt_adapter_request,
- ) = self._maybe_get_adapters(request)
+ lora_request = self._maybe_get_adapters(request)
if lora_request:
return self.create_error_response(
"Currently do not support LoRA for "
f"{self.task_type.title()}.")
- if prompt_adapter_request:
- return self.create_error_response(
- f"Currently do not support PromptAdapter for "
- f"{self.task_type.title()}.")
prompts, duration_s = await self._preprocess_speech_to_text(
request=request,
@@ -188,8 +181,7 @@ async def _create_speech_to_text(
# It will not display special tokens like <|startoftranscript|>
request.prompt,
params=sampling_params,
- lora_request=None,
- prompt_adapter_request=None)
+ lora_request=None)
list_result_generator = [
self.engine_client.generate(
diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
index 99e12201c96..bebe7afe5d8 100644
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -15,7 +15,6 @@
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.model_executor.layers.sampler import SamplerOutput
-from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sequence import ExecuteModelRequest, PoolerOutput
from vllm.utils import make_async
from vllm.worker.worker_base import WorkerBase
@@ -48,7 +47,6 @@ def __init__(
self.scheduler_config = vllm_config.scheduler_config
self.device_config = vllm_config.device_config
self.speculative_config = vllm_config.speculative_config
- self.prompt_adapter_config = vllm_config.prompt_adapter_config
self.observability_config = vllm_config.observability_config
self._init_executor()
self.is_sleeping = False
@@ -164,35 +162,6 @@ def list_loras(self) -> Set[int]:
assert s == sets[0], "All workers should have the same LORAs."
return sets[0]
- def add_prompt_adapter(
- self, prompt_adapter_request: PromptAdapterRequest) -> bool:
- assert prompt_adapter_request.prompt_adapter_id > 0, \
- "prompt_adapter_id must be greater than 0."
- return all(
- self.collective_rpc("add_prompt_adapter",
- args=(prompt_adapter_request, )))
-
- def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
- assert prompt_adapter_id > 0, \
- "prompt_adapter_id must be greater than 0."
- return all(
- self.collective_rpc("remove_prompt_adapter",
- args=(prompt_adapter_id, )))
-
- def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
- assert prompt_adapter_id > 0, \
- "prompt_adapter_id must be greater than 0."
- return all(
- self.collective_rpc("pin_prompt_adapter",
- args=(prompt_adapter_id, )))
-
- def list_prompt_adapters(self) -> Set[int]:
- sets = self.collective_rpc("list_prompt_adapters")
- for s in sets:
- assert (s == sets[0]
- ), "All workers should have the same prompt adapters."
- return sets[0]
-
def start_profile(self) -> None:
self.collective_rpc("start_profile")
diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py
index deda9bc23da..de5dc087665 100644
--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -13,7 +13,6 @@
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalEncDecInputs,
MultiModalInputs)
-from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.transformers_utils.tokenizer_group import TokenizerGroup
@@ -168,18 +167,6 @@ def _prepare_decoder_input_ids_for_generation(
return decoder_input_ids
- def _apply_prompt_adapter(
- self,
- prompt_token_ids: list[int],
- prompt_adapter_request: Optional[PromptAdapterRequest],
- ) -> list[int]:
- if prompt_adapter_request:
- prompt_token_ids = (
- [0] * prompt_adapter_request.prompt_adapter_num_virtual_tokens
- + prompt_token_ids)
-
- return prompt_token_ids
-
def _get_tokenization_kw(
self,
overrides: Optional[dict[str, Any]] = None,
@@ -786,15 +773,10 @@ async def _process_encoder_decoder_prompt_async(
def _build_decoder_only_llm_inputs(
self,
prompt_inputs: DecoderOnlyInputs,
- prompt_adapter_request: Optional[PromptAdapterRequest],
) -> DecoderOnlyInputs:
if "prompt_token_ids" in prompt_inputs:
prompt_inputs = cast(Union[TokenInputs, MultiModalInputs],
prompt_inputs) # Needed for mypy
- prompt_inputs["prompt_token_ids"] = self._apply_prompt_adapter(
- prompt_inputs["prompt_token_ids"],
- prompt_adapter_request=prompt_adapter_request,
- )
return prompt_inputs
@@ -803,7 +785,6 @@ def _process_decoder_only_prompt(
prompt: SingletonPrompt,
tokenization_kwargs: Optional[dict[str, Any]] = None,
lora_request: Optional[LoRARequest] = None,
- prompt_adapter_request: Optional[PromptAdapterRequest] = None,
return_mm_hashes: bool = False,
) -> DecoderOnlyInputs:
"""
@@ -815,7 +796,6 @@ def _process_decoder_only_prompt(
* prompt: input prompt
* lora_request
- * prompt_adapter_request
* return_mm_hashes
Returns:
@@ -830,17 +810,13 @@ def _process_decoder_only_prompt(
return_mm_hashes=return_mm_hashes,
)
- return self._build_decoder_only_llm_inputs(
- prompt_comps,
- prompt_adapter_request=prompt_adapter_request,
- )
+ return self._build_decoder_only_llm_inputs(prompt_comps)
async def _process_decoder_only_prompt_async(
self,
prompt: SingletonPrompt,
tokenization_kwargs: Optional[dict[str, Any]] = None,
lora_request: Optional[LoRARequest] = None,
- prompt_adapter_request: Optional[PromptAdapterRequest] = None,
return_mm_hashes: bool = False,
) -> DecoderOnlyInputs:
"""
@@ -854,17 +830,13 @@ async def _process_decoder_only_prompt_async(
return_mm_hashes=return_mm_hashes,
)
- return self._build_decoder_only_llm_inputs(
- prompt_comps,
- prompt_adapter_request=prompt_adapter_request,
- )
+ return self._build_decoder_only_llm_inputs(prompt_comps)
def preprocess(
self,
prompt: PromptType,
tokenization_kwargs: Optional[dict[str, Any]] = None,
lora_request: Optional[LoRARequest] = None,
- prompt_adapter_request: Optional[PromptAdapterRequest] = None,
return_mm_hashes: bool = False,
) -> ProcessorInputs:
"""Preprocess the input prompt."""
@@ -886,7 +858,6 @@ def preprocess(
prompt,
tokenization_kwargs=tokenization_kwargs,
lora_request=lora_request,
- prompt_adapter_request=prompt_adapter_request,
return_mm_hashes=return_mm_hashes,
)
@@ -895,7 +866,6 @@ async def preprocess_async(
prompt: PromptType,
tokenization_kwargs: Optional[dict[str, Any]] = None,
lora_request: Optional[LoRARequest] = None,
- prompt_adapter_request: Optional[PromptAdapterRequest] = None,
return_mm_hashes: bool = False,
) -> ProcessorInputs:
"""
@@ -919,6 +889,5 @@ async def preprocess_async(
prompt,
tokenization_kwargs=tokenization_kwargs,
lora_request=lora_request,
- prompt_adapter_request=prompt_adapter_request,
return_mm_hashes=return_mm_hashes,
)
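Callers of the preprocessor now pass only tokenization, LoRA, and multimodal-hash options. A thin wrapper sketch showing the trimmed call (the wrapper itself and its defaults are illustrative, not part of this patch):

from typing import Any, Optional

from vllm.inputs.preprocess import InputPreprocessor
from vllm.lora.request import LoRARequest


def preprocess_prompt(preprocessor: InputPreprocessor,
                      prompt: Any,
                      lora_request: Optional[LoRARequest] = None):
    return preprocessor.preprocess(
        prompt,
        tokenization_kwargs=None,
        lora_request=lora_request,
        return_mm_hashes=False,
    )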
diff --git a/vllm/prompt_adapter/__init__.py b/vllm/prompt_adapter/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/vllm/prompt_adapter/layers.py b/vllm/prompt_adapter/layers.py
deleted file mode 100644
index b5b925d042f..00000000000
--- a/vllm/prompt_adapter/layers.py
+++ /dev/null
@@ -1,83 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from dataclasses import dataclass
-from typing import Optional
-
-import torch
-from torch import nn
-
-from vllm.adapter_commons.layers import AdapterMapping
-from vllm.config import PromptAdapterConfig
-from vllm.model_executor.layers.vocab_parallel_embedding import (
- VocabParallelEmbedding)
-
-
-@dataclass
-class PromptAdapterMapping(AdapterMapping):
- pass
-
-
-class VocabParallelEmbeddingWithPromptAdapter(nn.Module):
-
- def __init__(self, base_layer: VocabParallelEmbedding) -> None:
- super().__init__()
- self.base_layer = base_layer
- self.emb_layer = self.base_layer
- if 'LoRA' in base_layer.__class__.__name__:
- self.emb_layer = self.base_layer.base_layer
-
- def create_prompt_adapter_weights(
- self, prompt_adapter_config: PromptAdapterConfig):
- self.embeddings_tensors = torch.zeros(
- (
- prompt_adapter_config.max_prompt_adapters,
- prompt_adapter_config.max_prompt_adapter_token,
- self.emb_layer.embedding_dim,
- ),
- dtype=self.emb_layer.weight.dtype,
- device=self.emb_layer.weight.device,
- )
- self.adapter_lengths = torch.zeros(
- prompt_adapter_config.max_prompt_adapters,
- dtype=torch.long,
- device=self.emb_layer.weight.device)
-
- self.indices_gpu: torch.Tensor
- self.embedding_indices_gpu: torch.Tensor
-
- def reset_prompt_adapter(self, index: int):
- self.embeddings_tensors[index] = 0
-
- def set_prompt_adapter(
- self,
- index: int,
- adapter_model: Optional[torch.Tensor],
- ):
- self.reset_prompt_adapter(index)
- if adapter_model is not None:
- length = adapter_model.shape[0]
- self.embeddings_tensors[index, :length] = adapter_model
- self.adapter_lengths[index] = length
-
- def set_mapping(
- self,
- prompt_indices: torch.Tensor,
- prompt_embedding_indices: torch.Tensor,
- ):
- self.indices_gpu = prompt_indices.to(
- device=self.emb_layer.weight.device)
- self.embedding_indices_gpu = prompt_embedding_indices.to(
- device=self.emb_layer.weight.device)
-
- def forward(self, x: torch.Tensor) -> torch.Tensor:
- hidden_states = self.base_layer(x)
- if self.embedding_indices_gpu.ndim > 1:
- valid_mask = self.indices_gpu != -1
- gathered_embeddings = self.embeddings_tensors[
- self.embedding_indices_gpu[:, 0],
- self.embedding_indices_gpu[:, 1]]
-
- # Update hidden states
- hidden_states[valid_mask] = gathered_embeddings
- return hidden_states
\ No newline at end of file
diff --git a/vllm/prompt_adapter/models.py b/vllm/prompt_adapter/models.py
deleted file mode 100644
index 864b50c861e..00000000000
--- a/vllm/prompt_adapter/models.py
+++ /dev/null
@@ -1,358 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import logging
-import math
-from typing import Any, Callable, Dict, List, Optional, Type
-
-import torch
-from torch import nn
-
-from vllm.adapter_commons.models import (AdapterLRUCache, AdapterModel,
- AdapterModelManager)
-from vllm.adapter_commons.utils import (add_adapter, deactivate_adapter,
- get_adapter, list_adapters,
- remove_adapter, set_adapter_mapping)
-from vllm.config import PromptAdapterConfig
-from vllm.prompt_adapter.layers import (
- VocabParallelEmbeddingWithPromptAdapter) # yapf: disable
-from vllm.prompt_adapter.layers import PromptAdapterMapping
-from vllm.prompt_adapter.utils import load_peft_weights
-
-logger = logging.getLogger(__name__)
-
-_GLOBAL_PROMPT_ADAPTER_ID = 0
-
-
-def get_prompt_adapter_id():
- global _GLOBAL_PROMPT_ADAPTER_ID
- _GLOBAL_PROMPT_ADAPTER_ID += 1
- return _GLOBAL_PROMPT_ADAPTER_ID
-
-
-def convert_to_embedding_indices(indices):
- embedding_indices = []
- count = 0
-
- for value in indices:
- if value == -1:
- count = 0
- else:
- embedding_indices.append([value, count])
- count += 1
-
- return torch.tensor(embedding_indices)
-
-
-def convert_mapping(
- mapping: PromptAdapterMapping,
- prompt_adapter_index_to_id: List[Optional[int]],
-) -> torch.Tensor:
- """Converts PromptAdapterMapping to index tensors.
-
- Args:
- mapping: PromptAdapterMapping mapping rows in a
- batch to PromptAdapter ids.
- prompt_adapter_index_to_id: List mapping PromptAdapter
- ids to PromptAdapter indices.
-
- Returns:
- pa_indices: Tensor of shape [batch_size] mapping batch rows to
- PromptAdapter indices.
- """
- id_to_index = {
- id_: idx
- for idx, id_ in enumerate(prompt_adapter_index_to_id)
- if id_ is not None
- }
- pa_indices = ([
- id_to_index.get(id_, -1) if id_ > 0 else -1
- for id_ in mapping.index_mapping
- ])
-
- pa_embedding_mapping = convert_to_embedding_indices(pa_indices)
- pa_indices = torch.tensor(pa_indices)
- return pa_indices, pa_embedding_mapping
-
-
-class PromptAdapterModel(AdapterModel):
-
- def __init__(self,
- prompt_adapter_id=None,
- num_virtual_tokens=None,
- prompt_embedding=None) -> None:
- self.id = prompt_adapter_id
- self.prompt_embedding = prompt_embedding
- self.num_virtual_tokens = num_virtual_tokens
-
- @classmethod
- def from_local_checkpoint(
- cls,
- adapter_model_path: str,
- prompt_adapter_id: int,
- num_virtual_tokens: int,
- config: PromptAdapterConfig,
- device: str = "cuda",
- ) -> "PromptAdapterModel":
-
- if num_virtual_tokens > config.max_prompt_adapter_token:
- raise ValueError(
- f'num_virtual_tokens ({num_virtual_tokens}) should be <= '
- f'max_prompt_adapter_token({config.max_prompt_adapter_token})')
-
- adapters_weights = load_peft_weights(adapter_model_path, device)
- prompt_embedding = adapters_weights["prompt_embeddings"].to(
- config.prompt_adapter_dtype)
-
- return cls(prompt_adapter_id, num_virtual_tokens, prompt_embedding)
-
-
-class PromptAdapterModelManager(AdapterModelManager):
- """A manager that manages multiple Prompt Adapter models."""
-
- def __init__(
- self,
- model: nn.Module,
- max_num_seqs: int,
- max_num_batched_tokens: int,
- prompt_adapter_config: PromptAdapterConfig,
- ):
- """Create a PromptAdapterModel and adapter for a given model.
-
- Args:
- model: the model to be adapted.
- max_num_seqs: the maximum number of sequences model can run in a
- single batch.
- max_num_batched_tokens: the maximum number of tokens model can run
- in a single batch.
- prompt_adapter_config: the PromptAdapter config,
- """
- self.model: nn.Module = model
- # Dict instead of a Set for compatibility with LRUCache.
- self.prompt_adapter_index_to_id: List[
- Optional[int]] = [None] * self.prompt_adapter_slots
- self.max_num_seqs = max_num_seqs
- self.max_num_batched_tokens = math.ceil(max_num_batched_tokens / 8) * 8
- self.prompt_adapter_config = prompt_adapter_config
- self.model.prompt_adapter_manager = self
- self.adapter_type = 'PromptAdapter'
-
- self.base_indices = torch.tensor([-1])
- self.base_embedding_indices = torch.tensor([])
-
- self.modules: Dict[str, nn.Module] = {}
- self._create_prompt_adapter_modules()
- self._last_mapping: Optional[PromptAdapterMapping] = None
-
- @property
- def prompt_adapter_slots(self) -> int:
- return self.prompt_adapter_config.max_prompt_adapters
-
- @property
- def adapter_slots(self) -> int:
- return self.prompt_adapter_slots
-
- @property
- def capacity(self) -> int:
- return self.prompt_adapter_config.max_cpu_prompt_adapters
-
- def activate_adapter(
- self,
- prompt_adapter_id: int,
- ) -> bool:
- """Move PromptAdapter into a GPU buffer
- to be used in the forward pass."""
- if prompt_adapter_id in self._active_adapters:
- return False
- first_free_slot = next(
- ((i, prompt_adapter_id) for i, prompt_adapter_id in enumerate(
- self.prompt_adapter_index_to_id) if prompt_adapter_id is None),
- None)
- if first_free_slot is None:
- raise ValueError("No free prompt_adapter slots")
- index, _ = first_free_slot
- self._active_adapters[prompt_adapter_id] = None
- prompt_adapter_model = (self._registered_adapters[prompt_adapter_id])
- logger.debug("Activating prompt_adapter. int id: %d, slot index: %d",
- prompt_adapter_model.id, index)
- self.prompt_adapter_index_to_id[index] = prompt_adapter_model.id
- for _, v in self.modules.items():
- v.set_prompt_adapter(index, prompt_adapter_model.prompt_embedding)
- return True
-
- def _deactivate_adapter(self, prompt_adapter_id: int):
- try:
- index = self.prompt_adapter_index_to_id.index(prompt_adapter_id)
- self.prompt_adapter_index_to_id[index] = None
- for _, v in self.modules.items():
- v.reset_prompt_adapter(index)
- except ValueError:
- pass
-
- def _add_adapter(self, prompt_adapter: PromptAdapterModel):
- self._registered_adapters[prompt_adapter.id] = prompt_adapter
-
- def _set_adapter_mapping(self, mapping: PromptAdapterMapping) -> None:
- base_indices, base_embedding_indices = convert_mapping(
- mapping, self.prompt_adapter_index_to_id)
- for k, v in self.modules.items():
- v.set_mapping(base_indices, base_embedding_indices)
-
- def _create_prompt_adapter_modules(self):
- for module_name, module in self.model.named_modules(
- remove_duplicate=False):
- if "VocabParallel" in module.__class__.__name__:
- new_module = VocabParallelEmbeddingWithPromptAdapter(module)
- new_module.create_prompt_adapter_weights(
- self.prompt_adapter_config)
- replaced_module = self.replace_submodule(
- self.model, module_name, new_module)
- self.register_module(module.__class__.__name__,
- replaced_module)
- replaced_module.set_mapping(self.base_indices,
- self.base_embedding_indices)
- break
-
- def replace_submodule(self, model: nn.Module, module_name: str,
- new_module: nn.Module) -> nn.Module:
- """Replace a submodule in a model with a new module."""
- parent = model.get_submodule(".".join(module_name.split(".")[:-1]))
- target_name = module_name.split(".")[-1]
- setattr(parent, target_name, new_module)
- return new_module
-
- def register_module(self, module_name: str, module: nn.Module):
- self.modules[module_name] = module
-
- def pin_adapter(self, prompt_adapter_id: int) -> bool:
- """Pin a PromptAdapterModel in the manager cache."""
- raise NotImplementedError(
- "Pinning is not supported in PromptAdapterModelManager. "
- "Use LRUCachePromptAdapterModelManager for pinning"
- ) # type: ignore
-
- def remove_all_adapters(self):
- """Remove all PromptAdapterModel from the manager."""
- self._registered_adapters.clear()
- self.prompt_adapter_index_to_id = [None] * self.prompt_adapter_slots
- self._active_adapters.clear()
-
- def deactivate_adapter(self, adapter_id: int) -> bool:
- return deactivate_adapter(adapter_id, self._active_adapters,
- self._deactivate_adapter)
-
- def add_adapter(self, adapter: PromptAdapterModel) -> bool:
- return add_adapter(adapter, self._registered_adapters, self.capacity,
- self._add_adapter)
-
- def set_adapter_mapping(self, mapping: PromptAdapterMapping) -> None:
- self._last_mapping = set_adapter_mapping(mapping, self._last_mapping,
- self._set_adapter_mapping)
-
- def remove_adapter(self, adapter_id: int) -> bool:
- return remove_adapter(adapter_id, self._registered_adapters,
- self.deactivate_adapter)
-
- def list_adapters(self) -> Dict[int, Any]:
- return list_adapters(self._registered_adapters)
-
- def get_adapter(self, adapter_id: int) -> Optional[Any]:
- return get_adapter(adapter_id, self._registered_adapters)
-
-
-class PromptAdapterLRUCache(AdapterLRUCache[PromptAdapterModel]):
-
- def __init__(self, capacity: int,
- deactivate_prompt_adapter_fn: Callable[[int], bool]):
- super().__init__(capacity, deactivate_prompt_adapter_fn)
-
-
-class LRUCachePromptAdapterModelManager(PromptAdapterModelManager):
- """A model manager that manages multiple prompt_adapters with LRU cache."""
-
- def __init__(
- self,
- model: nn.Module,
- max_num_seqs: int,
- max_num_batched_tokens: int,
- prompt_adapter_config: PromptAdapterConfig,
- ):
- self.prompt_adapter_config = prompt_adapter_config
- super().__init__(model, max_num_seqs, max_num_batched_tokens,
- prompt_adapter_config)
- self._registered_adapters = PromptAdapterLRUCache(
- self.capacity, self.deactivate_adapter)
- self._active_adapters = PromptAdapterLRUCache(
- self.prompt_adapter_slots, self._deactivate_adapter)
-
- def list_adapters(self) -> Dict[int, PromptAdapterModel]:
- """List all registered PromptAdapterModel."""
- return dict(self._registered_adapters.cache)
-
- def add_adapter(self, prompt_adapter: PromptAdapterModel) -> bool:
- """Add a PromptAdapterModel to the manager."""
- if prompt_adapter.id not in self._registered_adapters:
- self._add_adapter(prompt_adapter)
- was_added = True
- else:
- # We always touch to update the LRU cache order
- self._registered_adapters.touch(prompt_adapter.id)
- was_added = False
- return was_added
-
- def activate_adapter(
- self,
- prompt_adapter_id: int,
- ) -> bool:
- if prompt_adapter_id not in self._active_adapters and len(
- self._active_adapters) >= self.prompt_adapter_slots:
- self._active_adapters.remove_oldest()
- result = super().activate_adapter(prompt_adapter_id)
- # We always touch to update the LRU cache order
- self._active_adapters.touch(prompt_adapter_id)
- return result
-
- def remove_oldest_adapter(self) -> bool:
- if len(self._registered_adapters) > 0:
- self._registered_adapters.remove_oldest()
- return True
- return False
-
- def pin_adapter(self, prompt_adapter_id: int) -> bool:
- """Pin a PromptAdapterModel in the manager cache."""
- self._pin_prompt_adapter_in_cpu_cache(prompt_adapter_id)
- self._pin_prompt_adapter_in_gpu_cache(prompt_adapter_id)
- return True
-
- def _pin_prompt_adapter_in_cpu_cache(self, prompt_adapter_id: int):
- try:
- self._registered_adapters.pin(prompt_adapter_id)
- except ValueError as err:
- raise ValueError(
- "Pinning failed. "
- f"Prompt Adapter {prompt_adapter_id} is not registered."
- ) from err
-
- def _pin_prompt_adapter_in_gpu_cache(self, prompt_adapter_id: int):
- if prompt_adapter_id not in self._active_adapters:
- # move adapter to gpu if not already active
- self.activate_adapter(prompt_adapter_id)
- self._active_adapters.pin(prompt_adapter_id)
-
-
-def create_prompt_adapter_manager(
- model: nn.Module,
- max_num_seqs: int,
- max_num_batched_tokens: int,
- prompt_adapter_config: PromptAdapterConfig,
- prompt_adapter_manager_cls: Type[
- PromptAdapterModelManager] = PromptAdapterModelManager,
- **kwargs) -> PromptAdapterModelManager:
- """Create a PromptAdapterModel for a given model."""
- prompt_adapter_manager = prompt_adapter_manager_cls(
- model=model,
- max_num_seqs=max_num_seqs,
- max_num_batched_tokens=max_num_batched_tokens,
- prompt_adapter_config=prompt_adapter_config,
- **kwargs)
- return prompt_adapter_manager
diff --git a/vllm/prompt_adapter/request.py b/vllm/prompt_adapter/request.py
deleted file mode 100644
index 3ce50d0a26b..00000000000
--- a/vllm/prompt_adapter/request.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import msgspec
-
-from vllm.adapter_commons.request import AdapterRequest
-
-
-class PromptAdapterRequest(
- msgspec.Struct,
- array_like=True, # type: ignore[call-arg]
- omit_defaults=True, # type: ignore[call-arg]
- frozen=True): # type: ignore[call-arg]
- """
- Request for a Prompt adapter.
- """
- __metaclass__ = AdapterRequest
-
- prompt_adapter_name: str
- prompt_adapter_id: int
- prompt_adapter_local_path: str
- prompt_adapter_num_virtual_tokens: int
-
- def __hash__(self):
- return super().__hash__()
-
- @property
- def adapter_id(self):
- return self.prompt_adapter_id
-
- @property
- def name(self):
- return self.prompt_adapter_name
-
- @property
- def local_path(self):
- return self.prompt_adapter_local_path
diff --git a/vllm/prompt_adapter/utils.py b/vllm/prompt_adapter/utils.py
deleted file mode 100644
index ddd007868f6..00000000000
--- a/vllm/prompt_adapter/utils.py
+++ /dev/null
@@ -1,98 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-# code borrowed from: https://github.com/huggingface/peft/blob/v0.12.0/src/peft/utils/save_and_load.py#L420
-
-import os
-from typing import Optional
-
-import torch
-from huggingface_hub import file_exists, hf_hub_download
-from huggingface_hub.utils import EntryNotFoundError
-from safetensors.torch import load_file as safe_load_file
-
-from vllm.platforms import current_platform
-
-WEIGHTS_NAME = "adapter_model.bin"
-SAFETENSORS_WEIGHTS_NAME = "adapter_model.safetensors"
-
-
-# Get current device name based on available devices
-def infer_device() -> str:
- if current_platform.is_cuda_alike():
- return "cuda"
- return "cpu"
-
-
-def load_peft_weights(model_id: str,
- device: Optional[str] = None,
- **hf_hub_download_kwargs) -> dict:
- r"""
- A helper method to load the PEFT weights from the HuggingFace Hub or locally
-
- Args:
- model_id (`str`):
- The local path to the adapter weights or the name of the adapter to
- load from the HuggingFace Hub.
- device (`str`):
- The device to load the weights onto.
- hf_hub_download_kwargs (`dict`):
- Additional arguments to pass to the `hf_hub_download` method when
- loading from the HuggingFace Hub.
- """
- path = (os.path.join(model_id, hf_hub_download_kwargs["subfolder"]) if
- hf_hub_download_kwargs.get("subfolder") is not None else model_id)
-
- if device is None:
- device = infer_device()
-
- if os.path.exists(os.path.join(path, SAFETENSORS_WEIGHTS_NAME)):
- filename = os.path.join(path, SAFETENSORS_WEIGHTS_NAME)
- use_safetensors = True
- elif os.path.exists(os.path.join(path, WEIGHTS_NAME)):
- filename = os.path.join(path, WEIGHTS_NAME)
- use_safetensors = False
- else:
- token = hf_hub_download_kwargs.get("token")
- if token is None:
- token = hf_hub_download_kwargs.get("use_auth_token")
-
- hub_filename = (os.path.join(hf_hub_download_kwargs["subfolder"],
- SAFETENSORS_WEIGHTS_NAME)
- if hf_hub_download_kwargs.get("subfolder") is not None
- else SAFETENSORS_WEIGHTS_NAME)
- has_remote_safetensors_file = file_exists(
- repo_id=model_id,
- filename=hub_filename,
- revision=hf_hub_download_kwargs.get("revision"),
- repo_type=hf_hub_download_kwargs.get("repo_type"),
- token=token,
- )
- use_safetensors = has_remote_safetensors_file
-
- if has_remote_safetensors_file:
- # Priority 1: load safetensors weights
- filename = hf_hub_download(
- model_id,
- SAFETENSORS_WEIGHTS_NAME,
- **hf_hub_download_kwargs,
- )
- else:
- try:
- filename = hf_hub_download(model_id, WEIGHTS_NAME,
- **hf_hub_download_kwargs)
- except EntryNotFoundError:
- raise ValueError( # noqa: B904
- f"Can't find weights for {model_id} in {model_id} or \
- in the Hugging Face Hub. "
- f"Please check that the file {WEIGHTS_NAME} or \
- {SAFETENSORS_WEIGHTS_NAME} is present at {model_id}.")
-
- if use_safetensors:
- adapters_weights = safe_load_file(filename, device=device)
- else:
- adapters_weights = torch.load(filename,
- map_location=torch.device(device),
- weights_only=True)
-
- return adapters_weights
diff --git a/vllm/prompt_adapter/worker_manager.py b/vllm/prompt_adapter/worker_manager.py
deleted file mode 100644
index 56265de8087..00000000000
--- a/vllm/prompt_adapter/worker_manager.py
+++ /dev/null
@@ -1,179 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import logging
-from typing import Any, Optional, Set, Type
-
-import torch
-
-from vllm.adapter_commons.utils import (add_adapter_worker,
- apply_adapters_worker,
- list_adapters_worker,
- set_active_adapters_worker)
-from vllm.adapter_commons.worker_manager import AbstractWorkerManager
-from vllm.config import PromptAdapterConfig
-from vllm.prompt_adapter.models import (LRUCachePromptAdapterModelManager,
- PromptAdapterModel,
- PromptAdapterModelManager,
- create_prompt_adapter_manager)
-from vllm.prompt_adapter.request import PromptAdapterRequest
-
-logger = logging.getLogger(__name__)
-
-
-class WorkerPromptAdapterManager(AbstractWorkerManager):
- """WorkerPromptAdapterManager that manages
- prompt_adapter models on the worker side.
-
- Every request, the requested prompt_adapters will be
- loaded (unless they are already loaded),
- and every other prompt_adapter will be unloaded."""
-
- _manager_cls: Type[PromptAdapterModelManager] = PromptAdapterModelManager
-
- def __init__(
- self,
- max_num_seqs: int,
- max_num_batched_tokens: int,
- device: torch.device,
- prompt_adapter_config: PromptAdapterConfig,
- prompt_adapter_model_cls: Type[PromptAdapterModel] = PromptAdapterModel
- ):
- self._adapter_manager: PromptAdapterModelManager
- self.max_num_seqs = max_num_seqs
- self.max_num_batched_tokens = max_num_batched_tokens
- self._prompt_adapter_model_cls = prompt_adapter_model_cls
- self.prompt_adapter_config = prompt_adapter_config
- super().__init__(device)
-
- @property
- def is_enabled(self) -> bool:
- return True
-
- def create_prompt_adapter_manager(
- self,
- model: torch.nn.Module,
- ) -> Any:
- prompt_adapter_manager = create_prompt_adapter_manager(
- model,
- max_num_seqs=self.max_num_seqs,
- max_num_batched_tokens=self.max_num_batched_tokens,
- prompt_adapter_config=self.prompt_adapter_config,
- prompt_adapter_manager_cls=self._manager_cls,
- )
- self._adapter_manager = prompt_adapter_manager
- return prompt_adapter_manager.model
-
- def _load_adapter(
- self, prompt_adapter_request: PromptAdapterRequest
- ) -> PromptAdapterModel:
- try:
- prompt_adapter = (
- self._prompt_adapter_model_cls.from_local_checkpoint(
- prompt_adapter_request.prompt_adapter_local_path,
- prompt_adapter_id=prompt_adapter_request.prompt_adapter_id,
- num_virtual_tokens=prompt_adapter_request.
- prompt_adapter_num_virtual_tokens,
- config=self.prompt_adapter_config,
- device=str(self.device),
- ))
- except Exception as e:
- raise RuntimeError(
- f"Loading prompt_adapter "
- f"{prompt_adapter_request.prompt_adapter_local_path}"
- f" failed") from e
- return prompt_adapter
-
- def add_dummy_prompt_adapter(
- self, prompt_adapter_request: PromptAdapterRequest) -> bool:
- return True
-
- def pin_adapter(self, adapter_id: int) -> bool:
- return self._adapter_manager.pin_adapter(adapter_id)
-
- def set_active_adapters(self, requests: Set[Any],
- mapping: Optional[Any]) -> None:
- set_active_adapters_worker(requests, mapping, self._apply_adapters,
- self._adapter_manager.set_adapter_mapping)
-
- def add_adapter(self, adapter_request: Any) -> bool:
- return add_adapter_worker(adapter_request, self.list_adapters,
- self._load_adapter,
- self._adapter_manager.add_adapter,
- self._adapter_manager.activate_adapter)
-
- def _apply_adapters(self, adapter_requests: Set[Any]) -> None:
- apply_adapters_worker(adapter_requests, self.list_adapters,
- self._adapter_manager.adapter_slots,
- self.remove_adapter, self.add_adapter)
-
- def remove_adapter(self, adapter_id: int) -> bool:
- return self._adapter_manager.remove_adapter(adapter_id)
-
- def remove_all_adapters(self):
- self._adapter_manager.remove_all_adapters()
-
- def list_adapters(self) -> Set[int]:
- return list_adapters_worker(self._adapter_manager.list_adapters)
-
-
-class LRUCacheWorkerPromptAdapterManager(WorkerPromptAdapterManager):
- """WorkerPromptAdapterManager that manages
- prompt_adapter models on the worker side.
-
- Uses an LRU Cache. Every request, the requested
- prompt_adapters will be loaded (unless they are already loaded)
- and least recently used prompt_adapters will
- be unloaded if the cache is above capacity."""
-
- _prompt_adapter_manager_cls: Type[
- LRUCachePromptAdapterModelManager] = LRUCachePromptAdapterModelManager
-
- def create_prompt_adapter_manager(
- self,
- model: torch.nn.Module,
- ) -> Any:
- prompt_adapter_manager = create_prompt_adapter_manager(
- model,
- max_num_seqs=self.max_num_seqs,
- max_num_batched_tokens=self.max_num_batched_tokens,
- prompt_adapter_config=self.prompt_adapter_config,
- prompt_adapter_manager_cls=self._prompt_adapter_manager_cls)
- self._adapter_manager: LRUCachePromptAdapterModelManager = (
- prompt_adapter_manager)
- return prompt_adapter_manager.model
-
- def _apply_adapters(
- self, prompt_adapter_requests: Set[PromptAdapterRequest]) -> None:
- prompt_adapters_map = {
- prompt_adapter_request.prompt_adapter_id: prompt_adapter_request
- for prompt_adapter_request in prompt_adapter_requests
- if prompt_adapter_request
- }
- if len(prompt_adapters_map
- ) > self._adapter_manager.prompt_adapter_slots:
- raise RuntimeError(
- f"Number of requested prompt_adapters "
- f"({len(prompt_adapters_map)}) is greater "
- "than the number of GPU prompt_adapter slots "
- f"({self._adapter_manager.prompt_adapter_slots}).")
- for prompt_adapter in prompt_adapters_map.values():
- self.add_adapter(prompt_adapter)
-
- def add_adapter(self,
- prompt_adapter_request: PromptAdapterRequest) -> bool:
- if prompt_adapter_request.prompt_adapter_id not in self.list_adapters(
- ):
- # Remove before we load the new prompt_adapter to save memory
- if len(self._adapter_manager) + 1 > self._adapter_manager.capacity:
- self._adapter_manager.remove_oldest_adapter()
- prompt_adapter = self._load_adapter(prompt_adapter_request)
- loaded = self._adapter_manager.add_adapter(prompt_adapter)
- else:
- # If the prompt_adapter is already loaded, just touch it to
- # update its position in the caches
- loaded = self._adapter_manager.get_adapter(
- prompt_adapter_request.prompt_adapter_id) is not None
- self._adapter_manager.activate_adapter(
- prompt_adapter_request.prompt_adapter_id)
- return loaded
diff --git a/vllm/sequence.py b/vllm/sequence.py
index ffe890eb2da..ddeb89c87ac 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -19,7 +19,6 @@
from vllm.lora.request import LoRARequest
from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict
from vllm.pooling_params import PoolingParams
-from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import RequestOutputKind, SamplingParams
VLLM_TOKEN_ID_ARRAY_TYPE = "l"
@@ -466,7 +465,6 @@ class Sequence:
block size used by the block manager and cache engine.
eos_token_id: The end-of-sequence (EOS) token id recognized by this LLM.
lora_request: LoRA request.
- prompt_adapter_request: Prompt Adapter request.
"""
def __init__(
@@ -476,14 +474,12 @@ def __init__(
block_size: int,
eos_token_id: Optional[int] = None,
lora_request: Optional[LoRARequest] = None,
- prompt_adapter_request: Optional[PromptAdapterRequest] = None,
) -> None:
self.seq_id = seq_id
self.inputs = inputs
self.block_size = block_size
self.eos_token_id = eos_token_id
self.lora_request = lora_request
- self.prompt_adapter_request = prompt_adapter_request
self.data = SequenceData.from_seqs(
self.prompt_token_ids,
@@ -545,11 +541,6 @@ def multi_modal_placeholders(self) -> MultiModalPlaceholderDict:
def lora_int_id(self) -> int:
return self.lora_request.lora_int_id if self.lora_request else 0
- @property
- def prompt_adapter_id(self) -> int:
- return self.prompt_adapter_request.prompt_adapter_id \
- if self.prompt_adapter_request else 0
-
def get_output_text_to_return(self, buffer_length: int,
delta: bool) -> str:
"""If delta is True, only new text since the last call to
@@ -609,12 +600,12 @@ def extra_hash(self) -> Optional[int]:
designed for prefix caching mode. The final sequence hash is determined
by applying token_ids from the sequence's blocks.
"""
- if self.prompt_adapter_id == 0 and self.lora_int_id == 0:
+ if self.lora_int_id == 0:
return None
# NOTE: If there are additional factors influencing the block aside from
# token_ids, include them as input parameters to the hash.
- return hash((self.prompt_adapter_id, self.lora_int_id))
+ return hash(self.lora_int_id)
def num_hashed_tokens_of_block(self, logical_idx: int):
return logical_idx * self.block_size + self.block_size
@@ -715,7 +706,6 @@ class SequenceGroup:
encoder_seq: Optional, the single encoder sequence. Should be None
unless you are working with an encoder/decoder model.
trace_headers: OpenTelemetry trace headers.
- prompt_adapter_request: Prompt Adapter request.
priority: User-defined priority of the request.
draft_size: The number of speculative tokens plus one from the target
model; equal to max number of tokens a step can generate
@@ -733,7 +723,6 @@ def __init__(self,
pooled_data: Optional[torch.Tensor] = None,
encoder_seq: Optional[Sequence] = None,
trace_headers: Optional[Mapping[str, str]] = None,
- prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0,
draft_size: int = 1) -> None:
self.request_id = request_id
@@ -757,7 +746,6 @@ def __init__(self,
self.state = SequenceGroupState()
self.pooling_params = pooling_params
self.pooled_data = pooled_data
- self.prompt_adapter_request = prompt_adapter_request
self.encoder_seq = encoder_seq
self.trace_headers = trace_headers
self.priority = priority
@@ -812,16 +800,6 @@ def multi_modal_placeholders(self) -> MultiModalPlaceholderDict:
def lora_int_id(self) -> int:
return self.lora_request.lora_int_id if self.lora_request else 0
- @property
- def prompt_adapter_id(self) -> int:
- return self.prompt_adapter_request.prompt_adapter_id \
- if self.prompt_adapter_request else 0
-
- @property
- def prompt_adapter_num_virtual_tokens(self) -> int:
- return self.prompt_adapter_request.prompt_adapter_num_virtual_tokens\
- if self.prompt_adapter_request else 0
-
def init_multi_step(self, num_steps: int) -> None:
self.state.num_steps = num_steps
self.state.current_step = 0
@@ -1021,7 +999,6 @@ class SequenceGroupMetadata(
(SequenceGroup.encoder_seq). Should be None
unless you are working with an encoder/decoder
model.
- prompt_adapter_request: Prompt Adapter request.
"""
request_id: str
@@ -1040,7 +1017,6 @@ class SequenceGroupMetadata(
multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None
encoder_seq_data: Optional[SequenceData] = None
cross_block_table: Optional[list[int]] = None
- prompt_adapter_request: Optional[PromptAdapterRequest] = None
token_chunk_size: Optional[int] = None
### Stateful fields that are lazily defined. ###
@@ -1062,16 +1038,6 @@ def __post_init__(self):
def lora_int_id(self) -> int:
return self.lora_request.lora_int_id if self.lora_request else 0
- @property
- def prompt_adapter_id(self) -> int:
- return self.prompt_adapter_request.prompt_adapter_id \
- if self.prompt_adapter_request else 0
-
- @property
- def prompt_adapter_num_virtual_tokens(self) -> int:
- return self.prompt_adapter_request.prompt_adapter_num_virtual_tokens \
- if self.prompt_adapter_request else 0
-
# Multi-Step Chunked-Prefill property
@property
def is_single_step_prompt(self) -> bool:
@@ -1524,7 +1490,6 @@ def add_request(request_id: str, engine, params, **kwargs):
pooled_data=seq_group.pooled_data,
encoder_seq=seq_group.encoder_seq,
trace_headers=seq_group.trace_headers,
- prompt_adapter_request=seq_group.prompt_adapter_request,
priority=seq_group.priority,
)
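With prompt adapters removed, only the LoRA id can perturb the prefix-cache block hash, so extra_hash reduces to a single-field check. A free-function mirror of the new behaviour (hypothetical name, not part of the patch):

from typing import Optional


def block_extra_hash(lora_int_id: int) -> Optional[int]:
    # Mirrors Sequence.extra_hash after this patch: no LoRA means no extra
    # hash factor, so prefix-cache blocks hash on token ids alone.
    if lora_int_id == 0:
        return None
    return hash(lora_int_id)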
diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py
index 96646ec9471..876ae3b762a 100644
--- a/vllm/spec_decode/draft_model_runner.py
+++ b/vllm/spec_decode/draft_model_runner.py
@@ -137,7 +137,6 @@ def supports_gpu_multi_step(self, execute_model_req: ExecuteModelRequest):
1. Only decodes
2. Only flash-attn
3. No LORA
- 4. No prompt_adapter_config
"""
if not allow_gpu_advance_step:
return False
@@ -152,11 +151,7 @@ def supports_gpu_multi_step(self, execute_model_req: ExecuteModelRequest):
return False
# TODO: Add support for LORA
- if self.lora_config:
- return False
-
- # TODO: Add soft-tuning prompt adapter support
- return not self.prompt_adapter_config
+ return not self.lora_config
def set_indices_of_seq_with_bonus_tokens(self,
indices_of_seq_with_bonus_tokens):
@@ -199,9 +194,6 @@ def execute_model(
# Sanity
if self.lora_config is not None:
raise ValueError("TP1DraftModelRunner has no support for LORA")
- if self.prompt_adapter_config is not None:
- raise ValueError("TP1DraftModelRunner has no support for "
- "prompt_adapter_config")
if model_input.inputs_embeds is not None:
raise ValueError("TP1DraftModelRunner has no support for "
"inputs_embeds")
@@ -216,13 +208,6 @@ def execute_model(
self.set_active_loras(model_input.lora_requests,
model_input.lora_mapping)
- if self.prompt_adapter_config:
- assert model_input.prompt_adapter_requests is not None
- assert model_input.prompt_adapter_mapping is not None
- self.set_active_prompt_adapters(
- model_input.prompt_adapter_requests,
- model_input.prompt_adapter_mapping)
-
self.attn_state.begin_forward(model_input)
# Detect exec mode
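
With the prompt-adapter guard gone, the eligibility check in `supports_gpu_multi_step` collapses into a single boolean return after the decode-only and flash-attn guards. A minimal sketch of that guard-clause shape, with the request inspection reduced to plain booleans so it runs standalone (the real method inspects `execute_model_req`):

    from typing import Optional

    def supports_gpu_multi_step_sketch(decode_only: bool,
                                       flash_attn: bool,
                                       lora_config: Optional[object],
                                       allow_gpu_advance_step: bool = True) -> bool:
        # Mirrors the simplified gate: only decodes, only flash-attn, no LoRA.
        if not allow_gpu_advance_step:
            return False
        if not decode_only:
            return False
        if not flash_attn:
            return False
        # TODO upstream: add support for LoRA.
        return not lora_config

    assert supports_gpu_multi_step_sketch(True, True, None) is True
    assert supports_gpu_multi_step_sketch(True, True, object()) is False
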
diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py
index c18f1d12ba9..140cb65f876 100644
--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -128,10 +128,6 @@
"backends currently supported with encoder/"
"decoder models.")
-STR_NOT_IMPL_ENC_DEC_PROMPT_ADAPTER = ("Prompt adapters are not "
- "currently supported with encoder/"
- "decoder models.")
-
# Efficiently import all enc/dec error strings
# rather than having to import all of the above
STR_NOT_IMPL_ENC_DEC_ERR_STRS = {
@@ -145,7 +141,6 @@
"STR_NOT_IMPL_ENC_DEC_MM": STR_NOT_IMPL_ENC_DEC_MM,
"STR_NOT_IMPL_ENC_DEC_SPEC_DEC": STR_NOT_IMPL_ENC_DEC_SPEC_DEC,
"STR_NOT_IMPL_ENC_DEC_BACKEND": STR_NOT_IMPL_ENC_DEC_BACKEND,
- "STR_NOT_IMPL_ENC_DEC_PROMPT_ADAPTER": STR_NOT_IMPL_ENC_DEC_PROMPT_ADAPTER,
}
# Constants related to forcing the attention backend selection
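
The remaining entries of `STR_NOT_IMPL_ENC_DEC_ERR_STRS` are still looked up by key when an unsupported encoder/decoder scenario is detected (as in the `vllm/worker/utils.py` hunk later in this diff). A minimal sketch of that lookup-and-raise pattern; the registry below is an abbreviated stand-in and its message text is illustrative:

    # Abbreviated stand-in for the registry in vllm/utils/__init__.py.
    ENC_DEC_ERR_STRS = {
        "STR_NOT_IMPL_ENC_DEC_SPEC_DEC":
            "Speculative decoding is not currently supported with "
            "encoder/decoder models.",
    }

    def assert_no_lookahead_slots(num_lookahead_slots: int) -> None:
        # Mirrors the guard style used in assert_enc_dec_mr_supported_scenario.
        if num_lookahead_slots > 0:
            raise NotImplementedError(
                ENC_DEC_ERR_STRS["STR_NOT_IMPL_ENC_DEC_SPEC_DEC"])
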
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 3754570dfaa..ef472b62b43 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -19,7 +19,6 @@
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
from vllm.outputs import PoolingRequestOutput, RequestOutput
from vllm.pooling_params import PoolingParams
-from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import SamplingParams
from vllm.transformers_utils.config import (
maybe_register_config_serialize_by_value)
@@ -219,7 +218,6 @@ async def add_request(
lora_request: Optional[LoRARequest] = None,
tokenization_kwargs: Optional[dict[str, Any]] = None,
trace_headers: Optional[Mapping[str, str]] = None,
- prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0,
data_parallel_rank: Optional[int] = None,
) -> RequestOutputCollector:
@@ -236,8 +234,7 @@ async def add_request(
# Convert Input --> Request.
prompt_str, request = self.processor.process_inputs(
request_id, prompt, params, arrival_time, lora_request,
- tokenization_kwargs, trace_headers, prompt_adapter_request,
- priority, data_parallel_rank)
+ tokenization_kwargs, trace_headers, priority, data_parallel_rank)
if is_pooling or params.n == 1:
await self._add_request(request, prompt_str, None, 0, queue)
@@ -281,7 +278,6 @@ async def generate(
request_id: str,
lora_request: Optional[LoRARequest] = None,
trace_headers: Optional[Mapping[str, str]] = None,
- prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0,
data_parallel_rank: Optional[int] = None,
) -> AsyncGenerator[RequestOutput, None]:
@@ -312,7 +308,6 @@ async def generate(
sampling_params,
lora_request=lora_request,
trace_headers=trace_headers,
- prompt_adapter_request=prompt_adapter_request,
priority=priority,
data_parallel_rank=data_parallel_rank,
)
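
After this change, callers of `AsyncLLM.generate` pass at most a `lora_request` for adapters; there is no prompt-adapter keyword. A minimal usage sketch, assuming `engine`, `prompt`, and `sampling_params` already exist; only the keyword arguments visible in this hunk are taken from the signature:

    # Hypothetical consumer of AsyncLLM.generate after the removal.
    async def stream_one_request(engine, prompt, sampling_params, lora_request=None):
        async for output in engine.generate(
                prompt,
                sampling_params,
                request_id="demo-req-0",
                lora_request=lora_request,  # LoRA is the remaining adapter path
                priority=0,
        ):
            yield output
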
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index a2328c37ba0..bf5167f870b 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -17,7 +17,6 @@
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
from vllm.outputs import PoolingRequestOutput, RequestOutput
from vllm.pooling_params import PoolingParams
-from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import SamplingParams
from vllm.transformers_utils.tokenizer_group import (
TokenizerGroup, init_tokenizer_from_configs)
@@ -189,7 +188,6 @@ def add_request(
lora_request: Optional[LoRARequest] = None,
tokenization_kwargs: Optional[dict[str, Any]] = None,
trace_headers: Optional[Mapping[str, str]] = None,
- prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0,
) -> None:
# Validate the request_id type.
@@ -200,8 +198,7 @@ def add_request(
# Process raw inputs into the request.
prompt_str, request = self.processor.process_inputs(
request_id, prompt, params, arrival_time, lora_request,
- tokenization_kwargs, trace_headers, prompt_adapter_request,
- priority)
+ tokenization_kwargs, trace_headers, priority)
n = params.n if isinstance(params, SamplingParams) else 1
diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index 7af4ed54a22..6af3a194bb5 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -16,7 +16,6 @@
from vllm.multimodal.processing import EncDecMultiModalProcessor
from vllm.multimodal.utils import merge_and_sort_multimodal_metadata
from vllm.pooling_params import PoolingParams
-from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import SamplingParams
from vllm.transformers_utils.tokenizer_group import TokenizerGroup
from vllm.v1.engine import EngineCoreRequest
@@ -226,7 +225,6 @@ def process_inputs(
lora_request: Optional[LoRARequest] = None,
tokenization_kwargs: Optional[dict[str, Any]] = None,
trace_headers: Optional[Mapping[str, str]] = None,
- prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0,
data_parallel_rank: Optional[int] = None,
) -> tuple[Optional[str], EngineCoreRequest]:
@@ -237,8 +235,6 @@ def process_inputs(
self._validate_params(params, lora_request)
if trace_headers is not None:
raise ValueError("V1 does not support tracing yet.")
- if prompt_adapter_request is not None:
- raise ValueError("V1 does not support prompt_adapter_request.")
data_parallel_size = self.vllm_config.parallel_config.data_parallel_size
if data_parallel_rank is not None and not (0 <= data_parallel_rank <
@@ -253,12 +249,10 @@ def process_inputs(
# 1. Tokenize text prompt, with LoRA request if one exists.
# 2. For multimodal models with a merged preprocessor, preprocess
# multimodal data and expand prompt token ids accordingly.
- # 3. Apply prompt adapter to prompt token ids if one exists.
processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess(
prompt,
tokenization_kwargs=tokenization_kwargs,
lora_request=lora_request,
- prompt_adapter_request=prompt_adapter_request,
return_mm_hashes=self.use_hash,
)
from vllm.platforms import current_platform
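
The positional call sites in `llm_engine.py` and `async_llm.py` shrink to match. A sketch of the updated call shape, wrapped in a helper so it is self-contained; the first four arguments are passed positionally as at the call sites, and the keyword names come from the `process_inputs` signature in this hunk:

    def submit(processor, request_id, prompt, params, arrival_time,
               lora_request=None, tokenization_kwargs=None,
               trace_headers=None, priority=0, data_parallel_rank=None):
        # Updated call shape after the prompt-adapter argument was dropped.
        return processor.process_inputs(
            request_id, prompt, params, arrival_time,
            lora_request=lora_request,
            tokenization_kwargs=tokenization_kwargs,
            trace_headers=trace_headers,   # must be None: V1 rejects tracing
            priority=priority,
            data_parallel_rank=data_parallel_rank,
        )
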
diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py
index 97fec4704b4..c74d8c543f7 100644
--- a/vllm/v1/utils.py
+++ b/vllm/v1/utils.py
@@ -318,8 +318,6 @@ def report_usage_stats(
# Feature flags
"enable_lora":
bool(vllm_config.lora_config),
- "enable_prompt_adapter":
- bool(vllm_config.prompt_adapter_config),
"enable_prefix_caching":
vllm_config.cache_config.enable_prefix_caching,
"enforce_eager":
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index af216539c90..8abcf650599 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -102,7 +102,6 @@ def __init__(
self.parallel_config = vllm_config.parallel_config
self.scheduler_config = vllm_config.scheduler_config
self.speculative_config = vllm_config.speculative_config
- self.prompt_adapter_config = vllm_config.prompt_adapter_config
self.observability_config = vllm_config.observability_config
from vllm.model_executor.models.utils import set_cpu_offload_max_bytes
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py
index 83a80bd865b..dffe9cdb9ab 100644
--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -111,7 +111,6 @@ def __init__(
self.original_parallel_config = original_parallel_config
self.scheduler_config = vllm_config.scheduler_config
self.speculative_config = vllm_config.speculative_config
- self.prompt_adapter_config = vllm_config.prompt_adapter_config
self.observability_config = vllm_config.observability_config
self.device_config = vllm_config.device_config
diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py
index c4bf40d6654..e9b1db3973c 100644
--- a/vllm/v1/worker/tpu_worker.py
+++ b/vllm/v1/worker/tpu_worker.py
@@ -61,7 +61,6 @@ def __init__(
self.scheduler_config = vllm_config.scheduler_config
self.device_config = vllm_config.device_config
self.speculative_config = vllm_config.speculative_config
- self.prompt_adapter_config = vllm_config.prompt_adapter_config
self.observability_config = vllm_config.observability_config
self.parallel_config.rank = rank
diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py
index 8d92edc5b38..cb5d5664ab5 100644
--- a/vllm/worker/enc_dec_model_runner.py
+++ b/vllm/worker/enc_dec_model_runner.py
@@ -91,10 +91,9 @@ def __init__(
'''
EncoderDecoderModelRunner constructor.
- `lora_config` and `prompt_adapter_config` are
- unused (since these features are not yet supported for encoder/decoder
- models) but these arguments are present here for compatibility with
- the base-class constructor.
+ `lora_config` is unused (since LoRA is not yet supported for
+ encoder/decoder models), but the argument is present here for
+ compatibility with the base-class constructor.
'''
self._maybe_force_supported_attention_backend()
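
The reworded docstring above describes an accept-but-ignore contract: the encoder/decoder runner keeps the `lora_config` parameter so its constructor matches the base class, even though it never uses it. A minimal sketch of that pattern with hypothetical class names:

    from typing import Optional

    class BaseRunnerSketch:                      # stand-in for the runner base class
        def __init__(self, vllm_config: dict, lora_config: Optional[object] = None):
            self.vllm_config = vllm_config
            self.lora_config = lora_config

    class EncDecRunnerSketch(BaseRunnerSketch):  # stand-in for EncoderDecoderModelRunner
        def __init__(self, vllm_config: dict, lora_config: Optional[object] = None):
            # `lora_config` is accepted only for signature compatibility;
            # LoRA is not yet supported for encoder/decoder models, so it is
            # deliberately not forwarded.
            super().__init__(vllm_config, lora_config=None)
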
diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py
index 560110df0a3..4916a2a201f 100644
--- a/vllm/worker/hpu_worker.py
+++ b/vllm/worker/hpu_worker.py
@@ -24,7 +24,6 @@
from vllm.model_executor import set_random_seed
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.platforms import current_platform
-from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sequence import ExecuteModelRequest
from vllm.utils import bind_kv_cache
from vllm.worker.cache_engine import CacheEngine
@@ -367,23 +366,6 @@ def pin_lora(self, lora_id: int) -> bool:
def list_loras(self) -> Set[int]:
return self.model_runner.list_loras()
- def add_prompt_adapter(
- self, prompt_adapter_request: PromptAdapterRequest) -> bool:
- raise NotImplementedError(
- "Prompt Adapter is not implemented for HPU backend.")
-
- def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
- raise NotImplementedError(
- "Prompt Adapter is not implemented for HPU backend.")
-
- def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
- raise NotImplementedError(
- "Prompt Adapter is not implemented for HPU backend.")
-
- def list_prompt_adapters(self) -> Set[int]:
- raise NotImplementedError(
- "Prompt Adapter is not implemented for HPU backend.")
-
def shutdown_inc(self):
self.model_runner.shutdown_inc()
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index bced3ba9ba1..4bea37c8530 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -45,10 +45,6 @@
from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
MultiModalKwargs, MultiModalPlaceholderMap,
MultiModalRegistry)
-from vllm.prompt_adapter.layers import PromptAdapterMapping
-from vllm.prompt_adapter.request import PromptAdapterRequest
-from vllm.prompt_adapter.worker_manager import (
- LRUCacheWorkerPromptAdapterManager)
from vllm.sampling_params import SamplingParams
from vllm.sequence import IntermediateTensors, SequenceGroupMetadata
from vllm.utils import (DeviceMemoryProfiler, GiB_bytes, PyObjectCache,
@@ -95,8 +91,6 @@ class ModelInputForGPU(ModelRunnerInputBase):
lora_mapping: Optional["LoRAMapping"] = None
lora_requests: Optional[Set[LoRARequest]] = None
attn_metadata: Optional["AttentionMetadata"] = None
- prompt_adapter_mapping: Optional[PromptAdapterMapping] = None
- prompt_adapter_requests: Optional[Set[PromptAdapterRequest]] = None
multi_modal_kwargs: Optional[BatchedTensorInputs] = None
request_ids_to_seq_ids: Optional[Dict[str, List[int]]] = None
finished_requests_ids: Optional[List[str]] = None
@@ -113,8 +107,6 @@ def as_broadcastable_tensor_dict(self) -> Dict[str, Any]:
"lora_requests": self.lora_requests,
"lora_mapping": self.lora_mapping,
"multi_modal_kwargs": self.multi_modal_kwargs,
- "prompt_adapter_mapping": self.prompt_adapter_mapping,
- "prompt_adapter_requests": self.prompt_adapter_requests,
"virtual_engine": self.virtual_engine,
"request_ids_to_seq_ids": self.request_ids_to_seq_ids,
"finished_requests_ids": self.finished_requests_ids,
@@ -164,8 +156,6 @@ def as_broadcastable_tensor_dict(self) -> Dict[str, Any]:
"lora_requests": self.lora_requests,
"lora_mapping": self.lora_mapping,
"multi_modal_kwargs": self.multi_modal_kwargs,
- "prompt_adapter_mapping": self.prompt_adapter_mapping,
- "prompt_adapter_requests": self.prompt_adapter_requests,
"virtual_engine": self.virtual_engine,
"request_ids_to_seq_ids": self.request_ids_to_seq_ids,
"finished_requests_ids": self.finished_requests_ids,
@@ -212,8 +202,6 @@ def simple_reinit(self):
self.lora_index_mapping.clear() # type: ignore
self.lora_prompt_mapping.clear() # type: ignore
self.lora_requests.clear() # type: ignore
- self.prompt_adapter_index_mapping.clear() # type: ignore
- self.prompt_adapter_prompt_mapping.clear() # type: ignore
def __init__(
self,
@@ -252,11 +240,6 @@ def __init__(
lora_prompt_mapping: Optional[List[List[int]]] = None,
lora_requests: Optional[Set[LoRARequest]] = None,
- # Prompt adapter inputs.
- prompt_adapter_index_mapping: Optional[List[int]] = None,
- prompt_adapter_prompt_mapping: Optional[List[int]] = None,
- prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-
# Multi-modal inputs.
multi_modal_kwargs: Optional[MultiModalKwargs] = None,
multi_modal_placeholder_maps: Optional[Dict[
@@ -360,18 +343,6 @@ def __init__(
else:
self.lora_requests.clear()
- if prompt_adapter_index_mapping:
- self.prompt_adapter_index_mapping = \
- prompt_adapter_index_mapping
- else:
- self.prompt_adapter_index_mapping.clear()
-
- if prompt_adapter_prompt_mapping:
- self.prompt_adapter_prompt_mapping = \
- prompt_adapter_prompt_mapping
- else:
- self.prompt_adapter_prompt_mapping.clear()
-
else:
self.input_tokens = input_tokens or []
self.inputs_embeds = inputs_embeds
@@ -390,12 +361,6 @@ def __init__(
self.lora_prompt_mapping = lora_prompt_mapping or []
self.lora_requests = lora_requests or set()
- self.prompt_adapter_index_mapping = (
- prompt_adapter_index_mapping or [])
- self.prompt_adapter_prompt_mapping = (
- prompt_adapter_prompt_mapping or [])
-
- self.prompt_adapter_request = prompt_adapter_request
self.multi_modal_kwargs = multi_modal_kwargs
self.multi_modal_placeholder_maps = multi_modal_placeholder_maps
self.prefix_cache_hit = prefix_cache_hit
@@ -485,7 +450,6 @@ def __init__(self,
# Compute functions for each sequence group.
# WARNING: The order of the functions matters!
self.per_seq_group_compute_fns = [
- self._compute_prompt_adapter_input,
self._compute_multi_modal_input,
]
@@ -496,8 +460,6 @@ def __init__(self,
self.sliding_window = self.runner.sliding_window
self.block_size = self.runner.block_size
self.enable_lora = self.runner.lora_config is not None
- self.enable_prompt_adapter = (self.runner.prompt_adapter_config
- is not None)
# Attention metadata inputs.
if self.attn_backend is not None:
@@ -693,34 +655,6 @@ def _compute_lora_input(self, inter_data: InterDataForSeqGroup,
else:
inter_data.lora_prompt_mapping.append([])
- def _compute_prompt_adapter_input(
- self, inter_data: InterDataForSeqGroup,
- seq_group_metadata: SequenceGroupMetadata):
- """If prompt adapter is enabled, compute index and prompt mapping.
- """
- # Note that when is_prompt=True, we expect only one sequence
- # in the group.
- if not self.enable_prompt_adapter:
- return
-
- prompt_adapter_id = seq_group_metadata.prompt_adapter_id
- if prompt_adapter_id <= 0 or not inter_data.is_prompt:
- return
-
- # We expect only one sequence in the group when is_prompt=True.
- assert inter_data.n_seqs == 1
- query_len = inter_data.query_lens[0]
- inter_data.prompt_adapter_request = (
- seq_group_metadata.prompt_adapter_request)
-
- num_tokens = seq_group_metadata.prompt_adapter_num_virtual_tokens
- inter_data.prompt_adapter_index_mapping = [
- prompt_adapter_id
- ] * num_tokens + [0] * (query_len - num_tokens)
- inter_data.prompt_adapter_prompt_mapping = [prompt_adapter_id] * (
- query_len if seq_group_metadata.sampling_params
- and seq_group_metadata.sampling_params.prompt_logprobs else 1)
-
def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup,
seq_group_metadata: SequenceGroupMetadata):
"""If multi-modal data is given, add it to the input."""
@@ -1009,29 +943,6 @@ def build(self) -> ModelInputForGPU:
prompt_mapping=lora_prompt_mapping,
is_prefill=not self.decode_only))
- # Prompt adapter data.
- prompt_adapter_requests: Set[PromptAdapterRequest] = set()
- prompt_adapter_mapping = None
- if self.enable_prompt_adapter:
- prompt_adapter_requests = set(
- data.prompt_adapter_request for data in self.inter_data_list
- if data.prompt_adapter_request is not None)
- prompt_adapter_index_mapping = flatten_2d_lists([
- inter_data.prompt_adapter_index_mapping
- for inter_data in self.inter_data_list
- ])
- if cuda_graph_pad_size:
- prompt_adapter_index_mapping.extend(
- itertools.repeat(0, cuda_graph_pad_size))
- prompt_adapter_prompt_mapping = flatten_2d_lists([
- inter_data.prompt_adapter_prompt_mapping
- for inter_data in self.inter_data_list
- ])
- prompt_adapter_mapping = PromptAdapterMapping(
- prompt_adapter_index_mapping,
- prompt_adapter_prompt_mapping,
- )
-
# Multi-modal data.
multi_modal_kwargs_list = [
data.multi_modal_kwargs for data in self.inter_data_list
@@ -1051,9 +962,7 @@ def build(self) -> ModelInputForGPU:
lora_requests=lora_requests,
multi_modal_kwargs=multi_modal_kwargs,
request_ids_to_seq_ids=request_ids_to_seq_ids,
- finished_requests_ids=self.finished_requests_ids,
- prompt_adapter_mapping=prompt_adapter_mapping,
- prompt_adapter_requests=prompt_adapter_requests)
+ finished_requests_ids=self.finished_requests_ids)
class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
@@ -1148,7 +1057,6 @@ def __init__(
self.model: nn.Module # Set after load_model
# Set after load_model.
self.lora_manager: Optional[LRUCacheWorkerLoRAManager] = None
- self.prompt_adapter_manager: LRUCacheWorkerPromptAdapterManager = None
self.sampler = get_sampler()
set_cpu_offload_max_bytes(
@@ -1207,14 +1115,7 @@ def load_model(self) -> None:
logger.info("Model loading took %.4f GiB and %.6f seconds",
self.model_memory_usage / GiB_bytes,
time_after_load - time_before_load)
- if self.prompt_adapter_config:
- self.prompt_adapter_manager = LRUCacheWorkerPromptAdapterManager(
- self.scheduler_config.max_num_seqs,
- self.scheduler_config.max_num_batched_tokens, self.device,
- self.prompt_adapter_config)
- self.model = (
- self.prompt_adapter_manager.create_prompt_adapter_manager(
- self.model))
+
if self.vllm_config.compilation_config.level ==\
CompilationLevel.DYNAMO_AS_IS and supports_dynamo():
@@ -1466,40 +1367,6 @@ def list_loras(self) -> Set[int]:
raise RuntimeError("LoRA is not enabled.")
return self.lora_manager.list_adapters()
- def remove_all_prompt_adapters(self):
- if not self.prompt_adapter_manager:
- raise RuntimeError("PromptAdapter is not enabled.")
- self.prompt_adapter_manager.remove_all_adapters()
-
- def set_active_prompt_adapters(
- self, prompt_adapter_requests: Set[PromptAdapterRequest],
- prompt_adapter_mapping: PromptAdapterMapping) -> None:
- if not self.prompt_adapter_manager:
- raise RuntimeError("PromptAdapter is not enabled.")
- self.prompt_adapter_manager.set_active_adapters(
- prompt_adapter_requests, prompt_adapter_mapping)
-
- def add_prompt_adapter(
- self, prompt_adapter_request: PromptAdapterRequest) -> bool:
- if not self.prompt_adapter_manager:
- raise RuntimeError("PromptAdapter is not enabled.")
- return self.prompt_adapter_manager.add_adapter(prompt_adapter_request)
-
- def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
- if not self.prompt_adapter_manager:
- raise RuntimeError("PromptAdapter is not enabled.")
- return self.prompt_adapter_manager.remove_adapter(prompt_adapter_id)
-
- def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
- if not self.prompt_adapter_manager:
- raise RuntimeError("PromptAdapter is not enabled.")
- return self.prompt_adapter_manager.pin_adapter(prompt_adapter_id)
-
- def list_prompt_adapters(self) -> Set[int]:
- if not self.prompt_adapter_manager:
- raise RuntimeError("PromptAdapter is not enabled.")
- return self.prompt_adapter_manager.list_adapters()
-
@torch.inference_mode()
def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None:
"""Cuda graph capture a model.
@@ -1609,13 +1476,6 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None:
self.set_active_loras(set([dummy_lora_request]),
lora_mapping)
- if self.prompt_adapter_config:
- prompt_adapter_mapping = PromptAdapterMapping(
- [-1] * batch_size,
- [-1] * batch_size,
- )
- self.set_active_prompt_adapters(
- set(), prompt_adapter_mapping)
graph_runner = CUDAGraphRunner(
self.model, self.attn_backend.get_name(),
self.attn_state.graph_clone(batch_size),
@@ -1776,13 +1636,6 @@ def execute_model(
self.set_active_loras(model_input.lora_requests,
model_input.lora_mapping)
- if self.prompt_adapter_config:
- assert model_input.prompt_adapter_requests is not None
- assert model_input.prompt_adapter_mapping is not None
- self.set_active_prompt_adapters(
- model_input.prompt_adapter_requests,
- model_input.prompt_adapter_mapping)
-
self.attn_state.begin_forward(model_input)
# Currently cuda graph is only supported by the decode phase.
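
The deleted `_compute_prompt_adapter_input` built two mappings: it spread the adapter id over the virtual-token prefix of the query and zero-filled the rest, and sized the prompt mapping by whether prompt logprobs were requested. A standalone paraphrase of that mapping math (illustrative names, not part of the new code):

    def build_prompt_adapter_mappings(prompt_adapter_id: int,
                                      num_virtual_tokens: int,
                                      query_len: int,
                                      wants_prompt_logprobs: bool):
        # Index mapping: adapter id over the virtual-token prefix, 0 elsewhere.
        index_mapping = ([prompt_adapter_id] * num_virtual_tokens
                         + [0] * (query_len - num_virtual_tokens))
        # Prompt mapping: one entry per query token when prompt logprobs are
        # requested, otherwise a single entry.
        prompt_mapping = [prompt_adapter_id] * (
            query_len if wants_prompt_logprobs else 1)
        return index_mapping, prompt_mapping

    # Example: adapter 7 with 2 virtual tokens over a 5-token query.
    assert build_prompt_adapter_mappings(7, 2, 5, False) == ([7, 7, 0, 0, 0], [7])
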
diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py
index d567ce4a6e7..2b31fe52483 100644
--- a/vllm/worker/model_runner_base.py
+++ b/vllm/worker/model_runner_base.py
@@ -188,7 +188,6 @@ def __init__(
self.scheduler_config = vllm_config.scheduler_config
self.device_config = vllm_config.device_config
self.speculative_config = vllm_config.speculative_config
- self.prompt_adapter_config = vllm_config.prompt_adapter_config
self.observability_config = vllm_config.observability_config
# Map of request_id -> generator used for seeded random sampling
diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py
index 0680e60b52a..2aa910bdff6 100644
--- a/vllm/worker/multi_step_model_runner.py
+++ b/vllm/worker/multi_step_model_runner.py
@@ -288,9 +288,6 @@ def maybe_advance_frozen_model_input(self, device: str, pin_memory: bool):
assert fmi.lora_requests is not None
assert len(fmi.lora_requests) == 0
assert fmi.attn_metadata is not None
- assert fmi.prompt_adapter_mapping is None
- assert fmi.prompt_adapter_requests is not None
- assert len(fmi.prompt_adapter_requests) == 0
assert fmi.multi_modal_kwargs is not None
assert len(fmi.multi_modal_kwargs) == 0
diff --git a/vllm/worker/pooling_model_runner.py b/vllm/worker/pooling_model_runner.py
index f80955f71a5..01bbdf645ae 100644
--- a/vllm/worker/pooling_model_runner.py
+++ b/vllm/worker/pooling_model_runner.py
@@ -63,13 +63,6 @@ def execute_model(
self.set_active_loras(model_input.lora_requests,
model_input.lora_mapping)
- if self.prompt_adapter_config:
- assert model_input.prompt_adapter_requests is not None
- assert model_input.prompt_adapter_mapping is not None
- self.set_active_prompt_adapters(
- model_input.prompt_adapter_requests,
- model_input.prompt_adapter_mapping)
-
# Currently cuda graph is only supported by the decode phase.
assert model_input.attn_metadata is not None
prefill_meta = model_input.attn_metadata.prefill_metadata
diff --git a/vllm/worker/utils.py b/vllm/worker/utils.py
index 1a5f62cb3c4..512a1dca737 100644
--- a/vllm/worker/utils.py
+++ b/vllm/worker/utils.py
@@ -47,7 +47,3 @@ def assert_enc_dec_mr_supported_scenario(
if enc_dec_mr.scheduler_config.num_lookahead_slots > 0:
raise NotImplementedError(
STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_SPEC_DEC'])
-
- if enc_dec_mr.prompt_adapter_config is not None:
- raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_ERR_STRS[
- 'STR_NOT_IMPL_ENC_DEC_PROMPT_ADAPTER'])
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index b2926dbd185..5db7450fbf3 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -22,7 +22,6 @@
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
from vllm.platforms import current_platform
-from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sequence import (ExecuteModelRequest, IntermediateTensors,
SequenceGroupMetadata, SequenceGroupMetadataDelta)
from vllm.utils import (GiB_bytes, MemorySnapshot, bind_kv_cache,
@@ -512,19 +511,6 @@ def pin_lora(self, lora_id: int) -> bool:
def list_loras(self) -> Set[int]:
return self.model_runner.list_loras()
- def add_prompt_adapter(
- self, prompt_adapter_request: PromptAdapterRequest) -> bool:
- return self.model_runner.add_prompt_adapter(prompt_adapter_request)
-
- def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
- return self.model_runner.remove_lora(prompt_adapter_id)
-
- def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
- return self.model_runner.pin_prompt_adapter(prompt_adapter_id)
-
- def list_prompt_adapters(self) -> Set[int]:
- return self.model_runner.list_prompt_adapters()
-
@property
def max_model_len(self) -> int:
return self.model_config.max_model_len
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index c382b29ad19..515ca91a4de 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -49,7 +49,6 @@ def __init__(
self.scheduler_config = vllm_config.scheduler_config
self.device_config = vllm_config.device_config
self.speculative_config = vllm_config.speculative_config
- self.prompt_adapter_config = vllm_config.prompt_adapter_config
self.observability_config = vllm_config.observability_config
self.kv_transfer_config = vllm_config.kv_transfer_config
self.compilation_config = vllm_config.compilation_config