import pytest
from scheduling_utils import check_scheduler_inference_steps
-from spyre_util import get_spyre_backend_list, get_spyre_model_list
+from spyre_util import (compare_results, generate_hf_output,
+                        get_spyre_backend_list, get_spyre_model_list)


@pytest.mark.cb
@@ -34,6 +35,8 @@ def test_prompts_aligned_with_tkv_boundaries(model: str, backend: str,
    available_blocks = -1  # no restriction
    max_num_seqs = 2
    max_model_len = 256
+    # check_output = backend == "sendnn"
+    check_output = True

    checked_steps = [
        {
@@ -162,7 +165,7 @@ def test_prompts_aligned_with_tkv_boundaries(model: str, backend: str,
        },
    ]

-    check_scheduler_inference_steps(
+    cb_outputs, prompts = check_scheduler_inference_steps(
        model=model,
        backend=backend,
        monkeypatch=monkeypatch,
@@ -174,8 +177,22 @@ def test_prompts_aligned_with_tkv_boundaries(model: str, backend: str,
        max_model_len=max_model_len,
        available_blocks=available_blocks,
        use_cb=True,
+        collect_outputs=check_output,
    )

+    if check_output:
+        hf_outputs = generate_hf_output(
+            model=model,
+            prompts=prompts,
+            max_new_tokens=seqs_max_tokens,
+            ignore_eos=True,
+        )
+        compare_results(model=model,
+                        tensor_parallel_size=1,
+                        backend=backend,
+                        vllm_results=cb_outputs,
+                        hf_results=hf_outputs)
+

@pytest.mark.cb
@pytest.mark.parametrize("model", get_spyre_model_list())
@@ -200,6 +217,8 @@ def test_prompts_misaligned_with_tkv_boundaries(
    available_blocks = -1  # no restriction
    max_num_seqs = 2
    max_model_len = 256
+    # check_output = backend == "sendnn"
+    check_output = True

    checked_steps = [
        {
@@ -326,7 +345,7 @@ def test_prompts_misaligned_with_tkv_boundaries(
        },
    ]

-    check_scheduler_inference_steps(
+    cb_outputs, prompts = check_scheduler_inference_steps(
        model=model,
        backend=backend,
        monkeypatch=monkeypatch,
@@ -338,8 +357,22 @@ def test_prompts_misaligned_with_tkv_boundaries(
        max_model_len=max_model_len,
        available_blocks=available_blocks,
        use_cb=True,
+        collect_outputs=check_output,
    )

+    if check_output:
+        hf_outputs = generate_hf_output(
+            model=model,
+            prompts=prompts,
+            max_new_tokens=seqs_max_tokens,
+            ignore_eos=True,
+        )
+        compare_results(model=model,
+                        tensor_parallel_size=1,
+                        backend=backend,
+                        vllm_results=cb_outputs,
+                        hf_results=hf_outputs)
+

@pytest.mark.cb
@pytest.mark.parametrize("model", get_spyre_model_list())
@@ -363,6 +396,8 @@ def test_two_sequences_finish_same_time_as_new_arrive(
    available_blocks = -1  # no restriction
    max_num_seqs = 2
    max_model_len = 256
+    # check_output = backend == "sendnn"
+    check_output = True

    checked_steps = [
        {
@@ -466,7 +501,7 @@ def test_two_sequences_finish_same_time_as_new_arrive(
        },
    ]

-    check_scheduler_inference_steps(
+    cb_outputs, prompts = check_scheduler_inference_steps(
        model=model,
        backend=backend,
        monkeypatch=monkeypatch,
@@ -478,8 +513,22 @@ def test_two_sequences_finish_same_time_as_new_arrive(
        max_model_len=max_model_len,
        available_blocks=available_blocks,
        use_cb=True,
+        collect_outputs=check_output,
    )

+    if check_output:
+        hf_outputs = generate_hf_output(
+            model=model,
+            prompts=prompts,
+            max_new_tokens=seqs_max_tokens,
+            ignore_eos=True,
+        )
+        compare_results(model=model,
+                        tensor_parallel_size=1,
+                        backend=backend,
+                        vllm_results=cb_outputs,
+                        hf_results=hf_outputs)
+

@pytest.mark.cb
@pytest.mark.parametrize("model", get_spyre_model_list())
@@ -504,6 +553,8 @@ def test_new_sequence_joins_during_decode(model: str, backend: str,
    available_blocks = -1  # no restriction
    max_num_seqs = 4
    max_model_len = 256
+    # check_output = backend == "sendnn"
+    check_output = True

    checked_steps = [
        {
@@ -729,7 +780,7 @@ def test_new_sequence_joins_during_decode(model: str, backend: str,
        # },
    ]

-    check_scheduler_inference_steps(
+    cb_outputs, prompts = check_scheduler_inference_steps(
        model=model,
        backend=backend,
        monkeypatch=monkeypatch,
@@ -741,8 +792,22 @@ def test_new_sequence_joins_during_decode(model: str, backend: str,
        max_model_len=max_model_len,
        available_blocks=available_blocks,
        use_cb=True,
+        collect_outputs=check_output,
    )

+    if check_output:
+        hf_outputs = generate_hf_output(
+            model=model,
+            prompts=prompts,
+            max_new_tokens=seqs_max_tokens,
+            ignore_eos=True,
+        )
+        compare_results(model=model,
+                        tensor_parallel_size=1,
+                        backend=backend,
+                        vllm_results=cb_outputs,
+                        hf_results=hf_outputs)
+

@pytest.mark.cb
@pytest.mark.parametrize("model", get_spyre_model_list())
@@ -764,6 +829,7 @@ def test_prompt_too_long_for_current_tkv(model: str, backend: str,
    available_blocks = -1  # no restriction
    max_num_seqs = 2
    max_model_len = 256
+    check_output = False

    checked_steps = [
        {
@@ -878,7 +944,7 @@ def test_prompt_too_long_for_current_tkv(model: str, backend: str,
        },
    ]

-    check_scheduler_inference_steps(
+    cb_outputs, prompts = check_scheduler_inference_steps(
        model=model,
        backend=backend,
        monkeypatch=monkeypatch,
@@ -890,15 +956,30 @@ def test_prompt_too_long_for_current_tkv(model: str, backend: str,
        max_model_len=max_model_len,
        available_blocks=available_blocks,
        use_cb=True,
+        collect_outputs=check_output,
    )

+    if check_output:
+        hf_outputs = generate_hf_output(
+            model=model,
+            prompts=prompts,
+            max_new_tokens=seqs_max_tokens,
+            ignore_eos=True,
+        )
+        compare_results(model=model,
+                        tensor_parallel_size=1,
+                        backend=backend,
+                        vllm_results=cb_outputs,
+                        hf_results=hf_outputs)
+

@pytest.mark.cb
@pytest.mark.parametrize("model", get_spyre_model_list())
@pytest.mark.parametrize("backend", get_spyre_backend_list())
def test_requested_tokens_not_fitting_remaining_space(
        model: str, backend: str, monkeypatch: pytest.MonkeyPatch):
-    """ Scenario where the request goes beyond max_model_len
+    """ Scenario where the request goes beyond max_model_len and needs to wait
+    for a new batch.

    Configuration:
        * max_num_seqs: 2
@@ -914,6 +995,7 @@ def test_requested_tokens_not_fitting_remaining_space(
    available_blocks = -1  # no restriction
    max_num_seqs = 2
    max_model_len = 256
+    check_output = False

    checked_steps = [
        {
@@ -1065,7 +1147,7 @@ def test_requested_tokens_not_fitting_remaining_space(
        },
    ]

-    check_scheduler_inference_steps(
+    cb_outputs, prompts = check_scheduler_inference_steps(
        model=model,
        backend=backend,
        monkeypatch=monkeypatch,
@@ -1077,8 +1159,22 @@ def test_requested_tokens_not_fitting_remaining_space(
        max_model_len=max_model_len,
        available_blocks=available_blocks,
        use_cb=True,
+        collect_outputs=check_output,
    )

+    if check_output:
+        hf_outputs = generate_hf_output(
+            model=model,
+            prompts=prompts,
+            max_new_tokens=seqs_max_tokens,
+            ignore_eos=True,
+        )
+        compare_results(model=model,
+                        tensor_parallel_size=1,
+                        backend=backend,
+                        vllm_results=cb_outputs,
+                        hf_results=hf_outputs)
+

@pytest.mark.cb
@pytest.mark.parametrize("model", get_spyre_model_list())
@@ -1104,6 +1200,8 @@ def test_requests_use_all_available_blocks(model: str, backend: str,
    available_blocks = 8
    max_num_seqs = 4
    max_model_len = 256
+    # check_output = backend == "sendnn"
+    check_output = True

    checked_steps = [
        {
@@ -1199,7 +1297,7 @@ def test_requests_use_all_available_blocks(model: str, backend: str,
        },
    ]

-    check_scheduler_inference_steps(
+    cb_outputs, prompts = check_scheduler_inference_steps(
        model=model,
        backend=backend,
        monkeypatch=monkeypatch,
@@ -1211,8 +1309,22 @@ def test_requests_use_all_available_blocks(model: str, backend: str,
        max_model_len=max_model_len,
        available_blocks=available_blocks,
        use_cb=True,
+        collect_outputs=check_output,
    )

+    if check_output:
+        hf_outputs = generate_hf_output(
+            model=model,
+            prompts=prompts,
+            max_new_tokens=seqs_max_tokens,
+            ignore_eos=True,
+        )
+        compare_results(model=model,
+                        tensor_parallel_size=1,
+                        backend=backend,
+                        vllm_results=cb_outputs,
+                        hf_results=hf_outputs)
+

@pytest.mark.cb
@pytest.mark.parametrize("model", get_spyre_model_list())
@@ -1239,6 +1351,8 @@ def test_requests_use_more_than_available_blocks(
    available_blocks = 4
    max_num_seqs = 4
    max_model_len = 256
+    # check_output = backend == "sendnn"
+    check_output = True

    checked_steps = [
        {
@@ -1359,7 +1473,7 @@ def test_requests_use_more_than_available_blocks(
        },
    ]

-    check_scheduler_inference_steps(
+    cb_outputs, prompts = check_scheduler_inference_steps(
        model=model,
        backend=backend,
        monkeypatch=monkeypatch,
@@ -1371,4 +1485,18 @@ def test_requests_use_more_than_available_blocks(
        max_model_len=max_model_len,
        available_blocks=available_blocks,
        use_cb=True,
+        collect_outputs=check_output,
    )
+
+    if check_output:
+        hf_outputs = generate_hf_output(
+            model=model,
+            prompts=prompts,
+            max_new_tokens=seqs_max_tokens,
+            ignore_eos=True,
+        )
+        compare_results(model=model,
+                        tensor_parallel_size=1,
+                        backend=backend,
+                        vllm_results=cb_outputs,
+                        hf_results=hf_outputs)
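
Note: every hunk above wires in the same pattern — collect the continuous-batching outputs from the scheduler test helper, generate a plain Hugging Face reference for the same prompts, and compare the two. The real helpers live in spyre_util (generate_hf_output, compare_results); the standalone sketch below only illustrates the idea under stated assumptions: greedy decoding, a single max_new_tokens value rather than the per-sequence list the tests pass, and hypothetical function names that are not part of spyre_util.

    # Illustrative sketch of an HF reference check (assumptions noted above).
    from transformers import AutoModelForCausalLM, AutoTokenizer


    def hf_reference_output(model_name: str, prompts: list[str],
                            max_new_tokens: int) -> list[str]:
        """Greedy-decode each prompt with the vanilla HF model."""
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name)
        texts = []
        for prompt in prompts:
            inputs = tokenizer(prompt, return_tensors="pt")
            # ignore_eos=True in the tests is approximated here with
            # min_new_tokens: force the full requested generation length.
            out = model.generate(**inputs,
                                 max_new_tokens=max_new_tokens,
                                 min_new_tokens=max_new_tokens,
                                 do_sample=False)
            # Keep only the newly generated tokens, not the prompt.
            new_tokens = out[0, inputs["input_ids"].shape[1]:]
            texts.append(tokenizer.decode(new_tokens,
                                          skip_special_tokens=True))
        return texts


    def assert_outputs_match(vllm_texts: list[str],
                             hf_texts: list[str]) -> None:
        """Fail if any continuous-batching output drifts from the reference."""
        for i, (got, want) in enumerate(zip(vllm_texts, hf_texts)):
            assert got == want, f"prompt {i}: {got!r} != {want!r}"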