From aca33c56d17d911c53b07ea2b8723c5bc5e98d81 Mon Sep 17 00:00:00 2001
From: Travis Johnson
Date: Thu, 17 Apr 2025 14:48:48 -0600
Subject: [PATCH 1/6] fix: n>1 logprobs streaming edge case

Signed-off-by: Travis Johnson
---
 vllm/entrypoints/openai/serving_chat.py       | 3 ++-
 vllm/entrypoints/openai/serving_completion.py | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 10aced83b60b..c0e89caf1400 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -573,7 +573,8 @@ async def chat_completion_stream_generator(
                 if finish_reason_sent[i]:
                     continue
 
-                if request.logprobs and request.top_logprobs is not None:
+                if request.logprobs and request.top_logprobs is not None \
+                        and output.token_ids:
                     assert output.logprobs is not None, (
                         "Did not output logprobs")
                     logprobs = self._create_chat_logprobs(
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index a19fde8d70a8..cbd20c084cf1 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -358,7 +358,7 @@ async def completion_stream_generator(
                         # Chunked prefill case, don't return empty chunks
                         continue
 
-                    if request.logprobs is not None:
+                    if request.logprobs is not None and output.token_ids:
                         assert out_logprobs is not None, (
                             "Did not output logprobs")
                         logprobs = self._create_completion_logprobs(

From 52594bc3433cf7d58e1bd62ffab0b82e153b73e8 Mon Sep 17 00:00:00 2001
From: Travis Johnson
Date: Mon, 21 Apr 2025 11:30:27 -0600
Subject: [PATCH 2/6] refactor: set up tests/regressions

Signed-off-by: Travis Johnson
---
 .buildkite/test-pipeline.yaml | 5 ++---
 tests/regressions/__init__.py | 0
 .../test_llm_regression.py}   | 0
 3 files changed, 2 insertions(+), 3 deletions(-)
 create mode 100644 tests/regressions/__init__.py
 rename tests/{test_regression.py => regressions/test_llm_regression.py} (100%)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index d6c9ee680abf..38fe31eea67c 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -191,11 +191,10 @@ steps:
   mirror_hardwares: [amdexperimental, amdproduction]
   source_file_dependencies:
   - vllm/
-  - tests/test_regression
+  - tests/regressions
   commands:
   - pip install modelscope
-  - pytest -v -s test_regression.py
-  working_dir: "/vllm-workspace/tests" # optional
+  - pytest -v -s regressions
 
 - label: Engine Test # 10min
   mirror_hardwares: [amdexperimental, amdproduction]
diff --git a/tests/regressions/__init__.py b/tests/regressions/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/test_regression.py b/tests/regressions/test_llm_regression.py
similarity index 100%
rename from tests/test_regression.py
rename to tests/regressions/test_llm_regression.py

From 879d615b325e5bb4eefdb8c56a072f05630e46d2 Mon Sep 17 00:00:00 2001
From: Travis Johnson
Date: Tue, 22 Apr 2025 09:38:08 -0600
Subject: [PATCH 3/6] test: add openai request regression tests

Signed-off-by: Travis Johnson
---
 tests/regressions/test_openai_requests.py | 115 ++++++++++++++++++++++
 1 file changed, 115 insertions(+)
 create mode 100644 tests/regressions/test_openai_requests.py

diff --git a/tests/regressions/test_openai_requests.py b/tests/regressions/test_openai_requests.py
new file mode 100644
index 000000000000..2869baccda85
--- /dev/null
+++ b/tests/regressions/test_openai_requests.py
@@ -0,0 +1,115 @@
+# SPDX-License-Identifier: Apache-2.0
+
+# imports for guided decoding tests
+from itertools import chain
+
+import openai  # use the official client for correctness check
+import pytest
+import pytest_asyncio
+# downloading lora to test lora requests
+from openai.types import Completion
+
+from ..utils import RemoteOpenAIServer
+
+MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
+
+
+@pytest.fixture(scope="module")
+def default_server_args():
+    return [
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "8192",
+        "--max-num-seqs",
+        "128",
+        "--enforce-eager",
+    ]
+
+
+@pytest.fixture(scope="module")
+def server(default_server_args):
+    with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
+        yield remote_server
+
+
+@pytest_asyncio.fixture()
+async def client(server):
+    async with server.get_async_client() as async_client:
+        yield async_client
+
+
+@pytest.mark.asyncio
+async def test_multiseq_logprobs_streaming(client: openai.AsyncOpenAI):
+    """Edge case request combining multiple functionalities
+
+    https://github.com/vllm-project/vllm/pull/15259
+    https://github.com/vllm-project/vllm/pull/16805
+    """
+
+    # completions
+    stream = await client.completions.create(
+        model=MODEL_NAME,
+        prompt="1 2 3 4 5",
+        max_tokens=3,
+        # include usage chunk to make sure the stream is complete
+        stream_options={"include_usage": True},
+        stream=True,
+        n=2,
+        logprobs=0,  # include 1-top logprob per generated token
+        temperature=1.0)
+
+    n0_chunks: list[Completion] = []
+    n1_chunks: list[Completion] = []
+    usage_chunk: Completion = None
+    async for chunk in stream:
+        print(chunk)
+        if choices := chunk.choices:
+            assert len(choices) == 1, \
+                (f"Streamed chunk had {len(choices)} choices, when only 1 was"
+                 " expected")
+            choice = choices[0]
+            if choice.index == 0:
+                n0_chunks.append(chunk)
+            elif choice.index == 1:
+                n1_chunks.append(chunk)
+            else:
+                raise AssertionError(f"Unexpected choice index {choice.index}")
+
+        elif chunk.usage is not None:
+            usage_chunk = chunk
+
+        else:
+            raise AssertionError(f"Unexpected chunk {chunk}")
+
+    # check that we got the requested number of tokens
+    assert sum(
+        len(chunk.choices[0].logprobs.tokens) for chunk in n0_chunks
+        if chunk.choices[0].logprobs
+    ) == 3, "Streamed response did not have the expected number of tokens."
+    assert sum(
+        len(chunk.choices[0].logprobs.tokens) for chunk in n1_chunks
+        if chunk.choices[0].logprobs
+    ) == 3, "Streamed response did not have the expected number of tokens."
+
+    # check 1 logprob per token/chunk
+    for chunk in chain(n0_chunks, n1_chunks):
+        # a finish chunk may not have any text/logprobs
+        # V0 does not
+        # V1 does
+        choice = chunk.choices[0]
+        if choice.logprobs is None:
+            assert choice.finish_reason
+            assert choice.text == ''
+            continue
+
+        assert choice.logprobs.top_logprobs
+        for top_logprobs in choice.logprobs.top_logprobs:
+            assert len(top_logprobs) == 1
+
+    # requested usage
+    assert usage_chunk is not None
+    assert usage_chunk.usage.completion_tokens == 6
+    assert usage_chunk.usage.prompt_tokens == 9
+    assert usage_chunk.usage.total_tokens == 15

From 47e4388f0cd73cddd4124a251237136397290186 Mon Sep 17 00:00:00 2001
From: Travis Johnson
Date: Tue, 22 Apr 2025 09:40:54 -0600
Subject: [PATCH 4/6] fix: stream all chunks for multiseq

Signed-off-by: Travis Johnson
---
 vllm/sequence.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/vllm/sequence.py b/vllm/sequence.py
index ffe890eb2dab..e3f1f79edb31 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -1534,12 +1534,11 @@ def add_request(request_id: str, engine, params, **kwargs):
 
     def maybe_assemble_group(
             self, seq_group: SequenceGroup) -> Optional[SequenceGroup]:
-        # in the streaming mode, we will return the assembled sequence
-        # for the first remaining sequence, and then return None for the
-        # rest of sequences
+        # in the streaming mode, we will return the assembled sequence for the
+        # last remaining sequence, and return None for the rest of sequences
         if self.streaming:
-            first_remaining_id = next(iter(self.to_be_finished))
-            if seq_group.request_id == first_remaining_id:
+            last_remaining_id = list(self.to_be_finished)[-1]
+            if seq_group.request_id == last_remaining_id:
                 return self.assembled_seq_group
             return None
 

From ecbbc1f12d3b138a882982f8fd78a9a57467a0a9 Mon Sep 17 00:00:00 2001
From: Travis Johnson
Date: Tue, 22 Apr 2025 09:54:07 -0600
Subject: [PATCH 5/6] test: run regressions for V0 too

Signed-off-by: Travis Johnson
---
 .buildkite/test-pipeline.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 38fe31eea67c..efdab33319fc 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -195,6 +195,7 @@ steps:
   commands:
   - pip install modelscope
   - pytest -v -s regressions
+  - VLLM_USE_V1=0 pytest -v -s regressions
 
 - label: Engine Test # 10min
   mirror_hardwares: [amdexperimental, amdproduction]

From 57578c9d3503a7b1374d62c079ae6061dcb43591 Mon Sep 17 00:00:00 2001
From: Travis Johnson
Date: Thu, 15 May 2025 11:14:42 -0600
Subject: [PATCH 6/6] refactor: try a different fix

Signed-off-by: Travis Johnson
---
 vllm/sequence.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/vllm/sequence.py b/vllm/sequence.py
index e3f1f79edb31..9ebfea3eb8f3 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -1534,17 +1534,17 @@ def add_request(request_id: str, engine, params, **kwargs):
 
     def maybe_assemble_group(
             self, seq_group: SequenceGroup) -> Optional[SequenceGroup]:
-        # in the streaming mode, we will return the assembled sequence for the
-        # last remaining sequence, and return None for the rest of sequences
-        if self.streaming:
-            last_remaining_id = list(self.to_be_finished)[-1]
-            if seq_group.request_id == last_remaining_id:
+        # in the streaming mode, we will return the assembled sequence while
+        # sequences are still processing, but must choose only one of the
+        # remaining sequences
+        if self.streaming and not seq_group.is_finished():
+            first_remaining_id = next(iter(self.to_be_finished))
+            if seq_group.request_id == first_remaining_id:
                 return self.assembled_seq_group
             return None
 
-        # in the non-streaming mode, we will return the assembled sequence
-        # when the last sequences finishes, and then return None for the
-        # rest of the time
+        # for non-streaming and when finishing streaming, we will return the
+        # assembled sequence when the last sequence finishes
         if (len(self.to_be_finished) == 1
                 and seq_group.request_id in self.to_be_finished
                 and seq_group.is_finished()):
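
The selection rule in this final hunk can be illustrated with a small standalone sketch. The code below is a hypothetical model, not vLLM's implementation: the ToyParallelGroup class, its finish() and maybe_assemble() methods, and the string stand-in for the assembled group are invented for the example; only the streaming / to_be_finished bookkeeping mirrors the patch. It shows why electing the first remaining, still-unfinished sequence emits each assembled chunk exactly once while sequences are generating, and why the assembled group is returned one last time by the final sequence to finish.

# Hypothetical, simplified model of the maybe_assemble_group() selection rule
# above; this is NOT vLLM code, only an illustration of the logic.
from typing import Optional


class ToyParallelGroup:

    def __init__(self, request_ids: list[str], streaming: bool):
        self.streaming = streaming
        # insertion-ordered bookkeeping of sequences that have not finished
        self.to_be_finished = dict.fromkeys(request_ids)
        self.assembled = f"assembled({', '.join(request_ids)})"

    def finish(self, request_id: str) -> None:
        # called once a sequence's final output has been folded into the group
        self.to_be_finished.pop(request_id, None)

    def maybe_assemble(self, request_id: str,
                       finished: bool) -> Optional[str]:
        # while streaming and still generating, only the first remaining
        # sequence carries the assembled group; the others return None so the
        # same chunk is not emitted more than once per engine step
        if self.streaming and not finished:
            first_remaining_id = next(iter(self.to_be_finished))
            if request_id == first_remaining_id:
                return self.assembled
            return None
        # when a sequence finishes (and in non-streaming mode), emit the
        # assembled group only once, via the very last sequence to finish
        if (len(self.to_be_finished) == 1
                and request_id in self.to_be_finished and finished):
            return self.assembled
        return None


# n=2 request: seq-0 finishes one step before seq-1
group = ToyParallelGroup(["seq-0", "seq-1"], streaming=True)

# step 1: both sequences produce a token; only seq-0 emits the group
assert group.maybe_assemble("seq-0", finished=False) == group.assembled
assert group.maybe_assemble("seq-1", finished=False) is None

# step 2: seq-0 finishes; seq-1 becomes the first remaining sequence and
# carries the chunk, so seq-0's final output still reaches the stream
assert group.maybe_assemble("seq-0", finished=True) is None
group.finish("seq-0")
assert group.maybe_assemble("seq-1", finished=False) == group.assembled

# step 3: seq-1 finishes last and returns the final assembled group
assert group.maybe_assemble("seq-1", finished=True) == group.assembled
group.finish("seq-1")

Under this rule, output from a sequence that finishes early is still carried to the stream by a sequence that is still generating, which is the behavior the regression test added in PATCH 3 asserts.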