From aca33c56d17d911c53b07ea2b8723c5bc5e98d81 Mon Sep 17 00:00:00 2001
From: Travis Johnson
Date: Thu, 17 Apr 2025 14:48:48 -0600
Subject: [PATCH 1/6] fix: n>1 logprobs streaming edge case

Signed-off-by: Travis Johnson
---
 vllm/entrypoints/openai/serving_chat.py       | 3 ++-
 vllm/entrypoints/openai/serving_completion.py | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 10aced83b60b..c0e89caf1400 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -573,7 +573,8 @@ async def chat_completion_stream_generator(
                 if finish_reason_sent[i]:
                     continue
 
-                if request.logprobs and request.top_logprobs is not None:
+                if request.logprobs and request.top_logprobs is not None \
+                        and output.token_ids:
                     assert output.logprobs is not None, (
                         "Did not output logprobs")
                     logprobs = self._create_chat_logprobs(
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index a19fde8d70a8..cbd20c084cf1 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -358,7 +358,7 @@ async def completion_stream_generator(
                         # Chunked prefill case, don't return empty chunks
                         continue
 
-                    if request.logprobs is not None:
+                    if request.logprobs is not None and output.token_ids:
                         assert out_logprobs is not None, (
                             "Did not output logprobs")
                         logprobs = self._create_completion_logprobs(

From 52594bc3433cf7d58e1bd62ffab0b82e153b73e8 Mon Sep 17 00:00:00 2001
From: Travis Johnson
Date: Mon, 21 Apr 2025 11:30:27 -0600
Subject: [PATCH 2/6] refactor: set up tests/regressions

Signed-off-by: Travis Johnson
---
 .buildkite/test-pipeline.yaml | 5 ++---
 tests/regressions/__init__.py | 0
 .../test_llm_regression.py}   | 0
 3 files changed, 2 insertions(+), 3 deletions(-)
 create mode 100644 tests/regressions/__init__.py
 rename tests/{test_regression.py => regressions/test_llm_regression.py} (100%)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index d6c9ee680abf..38fe31eea67c 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -191,11 +191,10 @@ steps:
   mirror_hardwares: [amdexperimental, amdproduction]
   source_file_dependencies:
   - vllm/
-  - tests/test_regression
+  - tests/regressions
   commands:
   - pip install modelscope
-  - pytest -v -s test_regression.py
-  working_dir: "/vllm-workspace/tests" # optional
+  - pytest -v -s regressions
 
 - label: Engine Test # 10min
   mirror_hardwares: [amdexperimental, amdproduction]
diff --git a/tests/regressions/__init__.py b/tests/regressions/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/test_regression.py b/tests/regressions/test_llm_regression.py
similarity index 100%
rename from tests/test_regression.py
rename to tests/regressions/test_llm_regression.py

From 879d615b325e5bb4eefdb8c56a072f05630e46d2 Mon Sep 17 00:00:00 2001
From: Travis Johnson
Date: Tue, 22 Apr 2025 09:38:08 -0600
Subject: [PATCH 3/6] test: add openai request regression tests

Signed-off-by: Travis Johnson
---
 tests/regressions/test_openai_requests.py | 115 ++++++++++++++++++++++
 1 file changed, 115 insertions(+)
 create mode 100644 tests/regressions/test_openai_requests.py

diff --git a/tests/regressions/test_openai_requests.py b/tests/regressions/test_openai_requests.py
new file mode 100644
index 000000000000..2869baccda85
--- /dev/null
+++ b/tests/regressions/test_openai_requests.py
@@ -0,0 +1,115 @@
+# SPDX-License-Identifier: Apache-2.0
+
+# imports for guided decoding tests
+from itertools import chain
+
+import openai  # use the official client for correctness check
+import pytest
+import pytest_asyncio
+# downloading lora to test lora requests
+from openai.types import Completion
+
+from ..utils import RemoteOpenAIServer
+
+MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
+
+
+@pytest.fixture(scope="module")
+def default_server_args():
+    return [
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "8192",
+        "--max-num-seqs",
+        "128",
+        "--enforce-eager",
+    ]
+
+
+@pytest.fixture(scope="module")
+def server(default_server_args):
+    with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
+        yield remote_server
+
+
+@pytest_asyncio.fixture()
+async def client(server):
+    async with server.get_async_client() as async_client:
+        yield async_client
+
+
+@pytest.mark.asyncio
+async def test_multiseq_logprobs_streaming(client: openai.AsyncOpenAI):
+    """Edge case request combining multiple functionalities
+
+    https://github.com/vllm-project/vllm/pull/15259
+    https://github.com/vllm-project/vllm/pull/16805
+    """
+
+    # completions
+    stream = await client.completions.create(
+        model=MODEL_NAME,
+        prompt="1 2 3 4 5",
+        max_tokens=3,
+        # include usage chunk to make sure the stream is complete
+        stream_options={"include_usage": True},
+        stream=True,
+        n=2,
+        logprobs=0,  # include 1-top logprob per generated token
+        temperature=1.0)
+
+    n0_chunks: list[Completion] = []
+    n1_chunks: list[Completion] = []
+    usage_chunk: Completion = None
+    async for chunk in stream:
+        print(chunk)
+        if choices := chunk.choices:
+            assert len(choices) == 1, \
+                (f"Streamed chunk had {len(choices)} choices, when only 1 was"
+                 " expected")
+            choice = choices[0]
+            if choice.index == 0:
+                n0_chunks.append(chunk)
+            elif choice.index == 1:
+                n1_chunks.append(chunk)
+            else:
+                raise AssertionError(f"Unexpected choice index {choice.index}")
+
+        elif chunk.usage is not None:
+            usage_chunk = chunk
+
+        else:
+            raise AssertionError(f"Unexpected chunk {chunk}")
+
+    # check that we got the requested number of tokens
+    assert sum(
+        len(chunk.choices[0].logprobs.tokens) for chunk in n0_chunks
+        if chunk.choices[0].logprobs
+    ) == 3, "Streamed response did not have the expected number of tokens."
+    assert sum(
+        len(chunk.choices[0].logprobs.tokens) for chunk in n1_chunks
+        if chunk.choices[0].logprobs
+    ) == 3, "Streamed response did not have the expected number of tokens."
+
+    # check 1 logprob per token/chunk
+    for chunk in chain(n0_chunks, n1_chunks):
+        # a finish chunk may not have any text/logprobs
+        # V0 does not
+        # V1 does
+        choice = chunk.choices[0]
+        if choice.logprobs is None:
+            assert choice.finish_reason
+            assert choice.text == ''
+            continue
+
+        assert choice.logprobs.top_logprobs
+        for top_logprobs in choice.logprobs.top_logprobs:
+            assert len(top_logprobs) == 1
+
+    # requested usage
+    assert usage_chunk is not None
+    assert usage_chunk.usage.completion_tokens == 6
+    assert usage_chunk.usage.prompt_tokens == 9
+    assert usage_chunk.usage.total_tokens == 15

From 47e4388f0cd73cddd4124a251237136397290186 Mon Sep 17 00:00:00 2001
From: Travis Johnson
Date: Tue, 22 Apr 2025 09:40:54 -0600
Subject: [PATCH 4/6] fix: stream all chunks for multiseq

Signed-off-by: Travis Johnson
---
 vllm/sequence.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/vllm/sequence.py b/vllm/sequence.py
index ffe890eb2dab..e3f1f79edb31 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -1534,12 +1534,11 @@ def add_request(request_id: str, engine, params, **kwargs):
 
     def maybe_assemble_group(
             self, seq_group: SequenceGroup) -> Optional[SequenceGroup]:
-        # in the streaming mode, we will return the assembled sequence
-        # for the first remaining sequence, and then return None for the
-        # rest of sequences
+        # in the streaming mode, we will return the assembled sequence for the
+        # last remaining sequence, and return None for the rest of sequences
         if self.streaming:
-            first_remaining_id = next(iter(self.to_be_finished))
-            if seq_group.request_id == first_remaining_id:
+            last_remaining_id = list(self.to_be_finished)[-1]
+            if seq_group.request_id == last_remaining_id:
                 return self.assembled_seq_group
             return None
 

From ecbbc1f12d3b138a882982f8fd78a9a57467a0a9 Mon Sep 17 00:00:00 2001
From: Travis Johnson
Date: Tue, 22 Apr 2025 09:54:07 -0600
Subject: [PATCH 5/6] test: run regressions for V0 too

Signed-off-by: Travis Johnson
---
 .buildkite/test-pipeline.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 38fe31eea67c..efdab33319fc 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -195,6 +195,7 @@ steps:
   commands:
   - pip install modelscope
   - pytest -v -s regressions
+  - VLLM_USE_V1=0 pytest -v -s regressions
 
 - label: Engine Test # 10min
   mirror_hardwares: [amdexperimental, amdproduction]

From 57578c9d3503a7b1374d62c079ae6061dcb43591 Mon Sep 17 00:00:00 2001
From: Travis Johnson
Date: Thu, 15 May 2025 11:14:42 -0600
Subject: [PATCH 6/6] refactor: try a different fix

Signed-off-by: Travis Johnson
---
 vllm/sequence.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/vllm/sequence.py b/vllm/sequence.py
index e3f1f79edb31..9ebfea3eb8f3 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -1534,17 +1534,17 @@ def add_request(request_id: str, engine, params, **kwargs):
 
     def maybe_assemble_group(
             self, seq_group: SequenceGroup) -> Optional[SequenceGroup]:
-        # in the streaming mode, we will return the assembled sequence for the
-        # last remaining sequence, and return None for the rest of sequences
-        if self.streaming:
-            last_remaining_id = list(self.to_be_finished)[-1]
-            if seq_group.request_id == last_remaining_id:
+        # in the streaming mode, we will return the assembled sequence while
+        # sequences are still processing, but must choose only one of the
+        # remaining sequences
+        if self.streaming and not seq_group.is_finished():
+            first_remaining_id = next(iter(self.to_be_finished))
+            if seq_group.request_id == first_remaining_id:
                 return self.assembled_seq_group
             return None
 
-        # in the non-streaming mode, we will return the assembled sequence
-        # when the last sequences finishes, and then return None for the
-        # rest of the time
+        # for non-streaming and when finishing streaming, we will return the
+        # assembled sequence when the last sequence finishes
         if (len(self.to_be_finished) == 1
                 and seq_group.request_id in self.to_be_finished
                 and seq_group.is_finished()):
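
The selection rule in this final hunk can be illustrated with a small standalone sketch. The code below is a hypothetical model, not vLLM's implementation: the ToyParallelGroup class, its finish() and maybe_assemble() methods, and the string stand-in for the assembled group are invented for the example; only the streaming / to_be_finished bookkeeping mirrors the patch. It shows why electing the first remaining, still-unfinished sequence emits each assembled chunk exactly once while sequences are generating, and why the assembled group is returned one last time by the final sequence to finish.

# Hypothetical, simplified model of the maybe_assemble_group() selection rule
# above; this is NOT vLLM code, only an illustration of the logic.
from typing import Optional


class ToyParallelGroup:

    def __init__(self, request_ids: list[str], streaming: bool):
        self.streaming = streaming
        # insertion-ordered bookkeeping of sequences that have not finished
        self.to_be_finished = dict.fromkeys(request_ids)
        self.assembled = f"assembled({', '.join(request_ids)})"

    def finish(self, request_id: str) -> None:
        # called once a sequence's final output has been folded into the group
        self.to_be_finished.pop(request_id, None)

    def maybe_assemble(self, request_id: str,
                       finished: bool) -> Optional[str]:
        # while streaming and still generating, only the first remaining
        # sequence carries the assembled group; the others return None so the
        # same chunk is not emitted more than once per engine step
        if self.streaming and not finished:
            first_remaining_id = next(iter(self.to_be_finished))
            if request_id == first_remaining_id:
                return self.assembled
            return None
        # when a sequence finishes (and in non-streaming mode), emit the
        # assembled group only once, via the very last sequence to finish
        if (len(self.to_be_finished) == 1
                and request_id in self.to_be_finished and finished):
            return self.assembled
        return None


# n=2 request: seq-0 finishes one step before seq-1
group = ToyParallelGroup(["seq-0", "seq-1"], streaming=True)

# step 1: both sequences produce a token; only seq-0 emits the group
assert group.maybe_assemble("seq-0", finished=False) == group.assembled
assert group.maybe_assemble("seq-1", finished=False) is None

# step 2: seq-0 finishes; seq-1 becomes the first remaining sequence and
# carries the chunk, so seq-0's final output still reaches the stream
assert group.maybe_assemble("seq-0", finished=True) is None
group.finish("seq-0")
assert group.maybe_assemble("seq-1", finished=False) == group.assembled

# step 3: seq-1 finishes last and returns the final assembled group
assert group.maybe_assemble("seq-1", finished=True) == group.assembled
group.finish("seq-1")

Under this rule, output from a sequence that finishes early is still carried to the stream by a sequence that is still generating, which is the behavior the regression test added in PATCH 3 asserts.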