Skip to content

[CB] Add scheduling tests #329

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jul 24, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
279 changes: 279 additions & 0 deletions tests/e2e/test_spyre_cb_scheduler_steps.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def test_prompts_aligned_with_tkv_boundaries(model: str, backend: str,
steps_add_reqs = [0, 0, 0] # add all requests in the beginning
available_blocks = -1 # no restriction
max_num_seqs = 2
max_model_len = 256

checked_steps = [
{
Expand Down Expand Up @@ -170,6 +171,7 @@ def test_prompts_aligned_with_tkv_boundaries(model: str, backend: str,
steps_add_reqs=steps_add_reqs,
checked_steps=checked_steps,
max_num_seqs=max_num_seqs,
max_model_len=max_model_len,
available_blocks=available_blocks,
use_cb=True,
)
Expand Down Expand Up @@ -197,6 +199,7 @@ def test_prompts_misaligned_with_tkv_boundaries(
steps_add_reqs = [0, 0, 0] # add all requests in the beginning
available_blocks = -1 # no restriction
max_num_seqs = 2
max_model_len = 256

checked_steps = [
{
Expand Down Expand Up @@ -332,6 +335,7 @@ def test_prompts_misaligned_with_tkv_boundaries(
steps_add_reqs=steps_add_reqs,
checked_steps=checked_steps,
max_num_seqs=max_num_seqs,
max_model_len=max_model_len,
available_blocks=available_blocks,
use_cb=True,
)
Expand All @@ -358,6 +362,7 @@ def test_two_sequences_finish_same_time_as_new_arrive(
steps_add_reqs = [0, 0, 31]
available_blocks = -1 # no restriction
max_num_seqs = 2
max_model_len = 256

checked_steps = [
{
Expand Down Expand Up @@ -470,6 +475,270 @@ def test_two_sequences_finish_same_time_as_new_arrive(
steps_add_reqs=steps_add_reqs,
checked_steps=checked_steps,
max_num_seqs=max_num_seqs,
max_model_len=max_model_len,
available_blocks=available_blocks,
use_cb=True,
)


@pytest.mark.cb
@pytest.mark.parametrize("model", get_spyre_model_list())
@pytest.mark.parametrize("backend", get_spyre_backend_list())
def test_new_sequence_joins_during_decode(model: str, backend: str,
monkeypatch: pytest.MonkeyPatch):
""" Scenario where a new sequence joins while decoding other sequences

Configuration:
* max_num_seqs: 4
* number of prompts: 4
* 0: len = 49, max tokens = 119, step joining = 0
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggestion: maybe start counting at 0 here to use the sequence IDs

* 1: len = 14, max tokens = 52, step joining = 0
* 2: len = 89, max tokens = 104, step joining = 32
* 3: len = 9, max tokens = 64, step joining = 131
"""
# TODO change to 65 max_tokens for last prompt if ever possible

seqs_max_tokens = [119, 52, 104, 64]
prompts_lengths = [49, 14, 89, 9]
steps_add_reqs = [0, 0, 32, 131]
Comment on lines +501 to +503
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if we can lower the values here - the time for CB testing on CPU is on the rise, maybe (if possible) having shorter max_tokens can speed tests up if the test logic remains the same

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the eventual goal could be to reduce the total number of steps - the lesser the steps, the faster the test. I don't think we really need 197 steps for this test case?

Copy link
Collaborator

@prashantgupta24 prashantgupta24 Jul 24, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Something like

    seqs_max_tokens = [3, 10, 5]
    prompts_lengths = [10, 10, 10]
    steps_add_reqs = [0, 0, 5]

where request 0 would finish first, request 1 would be still decoding when request 2 shows up? Or am I missing something obvious?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If this can be made to work with lesser max_tokens, then perhaps we can open an issue to change all tests within this file to use lesser values to speed things up?

available_blocks = -1 # no restriction
max_num_seqs = 4
max_model_len = 256

checked_steps = [
{
"step": 0,
"tkv": 0,
"waiting": ["0", "1"],
"running": [],
"request_outputs": [],
"n_reserved_blocks": 0,
"n_used_blocks": 0
},
{
# Prefill sequence 0
"step": 1,
"tkv": 64,
"waiting": ["1"],
"running": ["0"],
"request_outputs": ["0"],
"n_reserved_blocks": 3, # prefill (1 block) + 118 decodes (2 blocks)
"n_used_blocks": 1
},
{
# Prefill sequence 1
"step": 2,
"tkv": 64,
"waiting": [],
"running": ["1", "0"],
"request_outputs": ["1"],
"n_reserved_blocks": 5, # prefill (1 block) + 51 decodes (1 block)
"n_used_blocks": 2
},
{
# Decode sequences 0 and 1
"step": 3,
"tkv": 65,
"waiting": [],
"running": ["1", "0"],
"request_outputs": ["1", "0"],
"n_reserved_blocks": 5,
"n_used_blocks": 4 # 2 blocks extended, one for each sequence
},
{
# Sequence 2 joins: one iteration in waiting queue
"step": 32,
"tkv": 94,
"waiting": ["2"],
"running": ["1", "0"],
"request_outputs": ["1", "0"],
"n_reserved_blocks": 5,
"n_used_blocks": 4
},
{
# Prefill sequence 2
"step": 33,
"tkv": 94,
"waiting": [],
"running": ["2", "1", "0"],
"request_outputs": ["2"],
"n_reserved_blocks": 9, # prefill (2 block) + 103 decode (2 block)
"n_used_blocks": 6
},
{
# Decode sequences 0, 1, and 2
"step": 34,
"tkv": 95,
"waiting": [],
"running": ["2", "1", "0"],
"request_outputs": ["2", "1", "0"],
"n_reserved_blocks": 9,
"n_used_blocks": 6
},
{
# Sequence 1 finishes at step 54
# (start step + 2 prefills + 51 decodes - 1) = 2 + 2 + 51 - 1 = 54
"step": 54,
"tkv": 115,
"waiting": [],
"running": ["2", "0"],
"request_outputs": ["2", "1", "0"],
"finished_requests": ["1"],
"n_reserved_blocks": 9,
"n_used_blocks": 6
},
{
# Decode sequences 0 and 2
"step": 55,
"tkv": 116,
"waiting": [],
"running": ["2", "0"],
"request_outputs": ["2", "0"],
"n_reserved_blocks": 7, # two blocks released
"n_used_blocks": 4 # two blocks released
},
{
# Decode sequences 0 and 2, tkv arrives to new block
"step": 68,
"tkv": 129,
"waiting": [],
"running": ["2", "0"],
"request_outputs": ["2", "0"],
"n_reserved_blocks": 7,
"n_used_blocks": 6 # 2 blocks extended, one for each sequence
},
{
# Sequence 0 finishes at step 121
# (start step + 3 prefills + 118 decode - 1) = 1 + 3 + 118 - 1 = 121
"step": 121,
"tkv": 182,
"waiting": [],
"running": ["2"],
"request_outputs": ["2", "0"],
"finished_requests": ["0"],
"n_reserved_blocks": 7,
"n_used_blocks": 6
},
{
# Decode sequence 2
"step": 122,
"tkv": 183,
"waiting": [],
"running": ["2"],
"request_outputs": ["2"],
"n_reserved_blocks": 4, # 3 blocks released
"n_used_blocks": 3 # 3 blocks released
},
{
# Sequence 3 joins: one iteration in waiting queue
"step": 131,
"tkv": 192,
"waiting": ["3"],
"running": ["2"],
"request_outputs": ["2"],
"n_reserved_blocks": 4,
"n_used_blocks": 3
},
{
# Prefill sequence 3
"step": 132,
"tkv": 192,
"waiting": [],
"running": ["3", "2"],
"request_outputs": ["3"],
"n_reserved_blocks": 8, # prefill (3 blocks) + 63 decode (1 block)
"n_used_blocks": 6 # prefill (3 block)
},
{
# Decode sequences 2 and 3
"step": 133,
"tkv": 193,
"waiting": [],
"running": ["3", "2"],
"request_outputs": ["3", "2"],
"n_reserved_blocks": 8,
"n_used_blocks": 8 # 2 blocks extended, one for each sequence
},
{
# Sequence 2 finishes at step 137
# (start step + 2 prefills + 103 decodes - 1) = 33 + 2 + 103 - 1 = 137
"step": 137,
"tkv": 197,
"waiting": [],
"running": ["3"],
"request_outputs": ["3", "2"],
"finished_requests": ["2"],
"n_reserved_blocks": 8,
"n_used_blocks": 8
},
{
# Decode sequence 3
"step": 138,
"tkv": 70,
"waiting": [],
"running": ["3"],
"request_outputs": ["3"],
# 6 blocks freed: finished sequence (4) + left padding stripping (2)
"n_reserved_blocks": 2,
"n_used_blocks": 2
},
{
# Sequence 3 finishes at step 195
# (start step + 1 prefills + 63 decodes - 1) = 132 + 1 + 63 - 1 = 195
"step": 195,
"tkv": 127,
"waiting": [],
"running": [],
"request_outputs": ["3"],
"finished_requests": ["3"],
"n_reserved_blocks": 2,
"n_used_blocks": 2
},
{
# Tkv should be cleared one step later
"step": 196,
"tkv": 0,
"waiting": [],
"running": [],
"request_outputs": [],
"n_reserved_blocks": 0,
"n_used_blocks": 0
},
# TODO this is when max_tokens = 65 for last prompt
# {
# # Sequence 3 finishes at step 196
# # (start step + 1 prefills + 64 decodes - 1) = 132 + 1 + 64 - 1 = 196
# "step": 196,
# "tkv": 128,
# "waiting": [],
# "running": [],
# "request_outputs": ["3"],
# "finished_requests": ["3"],
# "n_reserved_blocks": 2,
# "n_used_blocks": 2
# },
# {
# # Tkv should be cleared one step later
# "step": 197,
# "tkv": 0,
# "waiting": [],
# "running": [],
# "request_outputs": [],
# "n_reserved_blocks": 0,
# "n_used_blocks": 0
# },
]

check_scheduler_inference_steps(
model=model,
backend=backend,
monkeypatch=monkeypatch,
seqs_max_tokens=seqs_max_tokens,
prompts_lengths=prompts_lengths,
steps_add_reqs=steps_add_reqs,
checked_steps=checked_steps,
max_num_seqs=max_num_seqs,
max_model_len=max_model_len,
available_blocks=available_blocks,
use_cb=True,
)
Expand All @@ -494,6 +763,7 @@ def test_prompt_too_long_for_current_tkv(model: str, backend: str,
steps_add_reqs = [0, 0]
available_blocks = -1 # no restriction
max_num_seqs = 2
max_model_len = 256

checked_steps = [
{
Expand Down Expand Up @@ -617,6 +887,7 @@ def test_prompt_too_long_for_current_tkv(model: str, backend: str,
steps_add_reqs=steps_add_reqs,
checked_steps=checked_steps,
max_num_seqs=max_num_seqs,
max_model_len=max_model_len,
available_blocks=available_blocks,
use_cb=True,
)
Expand All @@ -642,6 +913,7 @@ def test_requested_tokens_not_fitting_remaining_space(
steps_add_reqs = [0, 0, 0]
available_blocks = -1 # no restriction
max_num_seqs = 2
max_model_len = 256

checked_steps = [
{
Expand Down Expand Up @@ -802,6 +1074,7 @@ def test_requested_tokens_not_fitting_remaining_space(
steps_add_reqs=steps_add_reqs,
checked_steps=checked_steps,
max_num_seqs=max_num_seqs,
max_model_len=max_model_len,
available_blocks=available_blocks,
use_cb=True,
)
Expand Down Expand Up @@ -830,6 +1103,8 @@ def test_requests_use_all_available_blocks(model: str, backend: str,
# total number of blocks needed if scheduled together : 4 * (1 + 1) = 8
available_blocks = 8
max_num_seqs = 4
max_model_len = 256

checked_steps = [
{
"step": 0,
Expand Down Expand Up @@ -933,6 +1208,7 @@ def test_requests_use_all_available_blocks(model: str, backend: str,
steps_add_reqs=steps_add_reqs,
checked_steps=checked_steps,
max_num_seqs=max_num_seqs,
max_model_len=max_model_len,
available_blocks=available_blocks,
use_cb=True,
)
Expand Down Expand Up @@ -962,6 +1238,8 @@ def test_requests_use_more_than_available_blocks(
# total number of blocks needed if scheduled together : 4 * (1 + 1) = 8
available_blocks = 4
max_num_seqs = 4
max_model_len = 256

checked_steps = [
{
"step": 0,
Expand Down Expand Up @@ -1090,6 +1368,7 @@ def test_requests_use_more_than_available_blocks(
steps_add_reqs=steps_add_reqs,
checked_steps=checked_steps,
max_num_seqs=max_num_seqs,
max_model_len=max_model_len,
available_blocks=available_blocks,
use_cb=True,
)
1 change: 1 addition & 0 deletions tests/scheduling_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def check_scheduler_inference_steps(
steps_add_reqs: list[int],
checked_steps: list[dict[str, Any]],
max_num_seqs: int,
max_model_len: int,
available_blocks: int,
use_cb: bool = True,
):
Expand Down
Loading