-
Notifications
You must be signed in to change notification settings - Fork 18
[CB] Add scheduling tests #329
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -33,6 +33,7 @@ def test_prompts_aligned_with_tkv_boundaries(model: str, backend: str, | |
steps_add_reqs = [0, 0, 0] # add all requests in the beginning | ||
available_blocks = -1 # no restriction | ||
max_num_seqs = 2 | ||
max_model_len = 256 | ||
|
||
checked_steps = [ | ||
{ | ||
|
@@ -170,6 +171,7 @@ def test_prompts_aligned_with_tkv_boundaries(model: str, backend: str, | |
steps_add_reqs=steps_add_reqs, | ||
checked_steps=checked_steps, | ||
max_num_seqs=max_num_seqs, | ||
max_model_len=max_model_len, | ||
available_blocks=available_blocks, | ||
use_cb=True, | ||
) | ||
|
@@ -197,6 +199,7 @@ def test_prompts_misaligned_with_tkv_boundaries( | |
steps_add_reqs = [0, 0, 0] # add all requests in the beginning | ||
available_blocks = -1 # no restriction | ||
max_num_seqs = 2 | ||
max_model_len = 256 | ||
|
||
checked_steps = [ | ||
{ | ||
|
@@ -332,6 +335,7 @@ def test_prompts_misaligned_with_tkv_boundaries( | |
steps_add_reqs=steps_add_reqs, | ||
checked_steps=checked_steps, | ||
max_num_seqs=max_num_seqs, | ||
max_model_len=max_model_len, | ||
available_blocks=available_blocks, | ||
use_cb=True, | ||
) | ||
|
@@ -358,6 +362,7 @@ def test_two_sequences_finish_same_time_as_new_arrive( | |
steps_add_reqs = [0, 0, 31] | ||
available_blocks = -1 # no restriction | ||
max_num_seqs = 2 | ||
max_model_len = 256 | ||
|
||
checked_steps = [ | ||
{ | ||
|
@@ -470,6 +475,270 @@ def test_two_sequences_finish_same_time_as_new_arrive( | |
steps_add_reqs=steps_add_reqs, | ||
checked_steps=checked_steps, | ||
max_num_seqs=max_num_seqs, | ||
max_model_len=max_model_len, | ||
available_blocks=available_blocks, | ||
use_cb=True, | ||
) | ||
|
||
|
||
@pytest.mark.cb | ||
@pytest.mark.parametrize("model", get_spyre_model_list()) | ||
@pytest.mark.parametrize("backend", get_spyre_backend_list()) | ||
def test_new_sequence_joins_during_decode(model: str, backend: str, | ||
monkeypatch: pytest.MonkeyPatch): | ||
""" Scenario where a new sequence joins while decoding other sequences | ||
|
||
Configuration: | ||
* max_num_seqs: 4 | ||
* number of prompts: 4 | ||
* 1: len = 49, max tokens = 119, step joining = 0 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Suggestion: maybe start counting at 0 here to use the sequence IDs |
||
* 2: len = 14, max tokens = 52, step joining = 0 | ||
* 3: len = 89, max tokens = 104, step joining = 32 | ||
* 4: len = 9, max tokens = 64, step joining = 131 | ||
""" | ||
# TODO change to 65 max_tokens for last prompt if ever possible | ||
|
||
seqs_max_tokens = [119, 52, 104, 64] | ||
prompts_lengths = [49, 14, 89, 9] | ||
steps_add_reqs = [0, 0, 32, 131] | ||
Comment on lines
+501
to
+503
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wonder if we can lower the values here - the time for CB testing on CPU is on the rise, maybe (if possible) having shorter max_tokens can speed tests up if the test logic remains the same There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the eventual goal could be to reduce the total number of steps - the lesser the steps, the faster the test. I don't think we really need There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Something like
where request 0 would finish first, request 1 would be still decoding when request 2 shows up? Or am I missing something obvious? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If this can be made to work with lesser |
||
available_blocks = -1 # no restriction | ||
max_num_seqs = 4 | ||
max_model_len = 256 | ||
|
||
checked_steps = [ | ||
{ | ||
"step": 0, | ||
"tkv": 0, | ||
"waiting": ["0", "1"], | ||
"running": [], | ||
"request_outputs": [], | ||
"n_reserved_blocks": 0, | ||
"n_used_blocks": 0 | ||
}, | ||
{ | ||
# Prefill sequence 0 | ||
"step": 1, | ||
"tkv": 64, | ||
"waiting": ["1"], | ||
"running": ["0"], | ||
"request_outputs": ["0"], | ||
"n_reserved_blocks": 3, # prefill (1 block) + 118 decodes (2 blocks) | ||
"n_used_blocks": 1 | ||
}, | ||
{ | ||
# Prefill sequence 1 | ||
"step": 2, | ||
"tkv": 64, | ||
"waiting": [], | ||
"running": ["1", "0"], | ||
"request_outputs": ["1"], | ||
"n_reserved_blocks": 5, # prefill (1 block) + 51 decodes (1 block) | ||
"n_used_blocks": 2 | ||
}, | ||
{ | ||
# Decode sequences 0 and 1 | ||
"step": 3, | ||
"tkv": 65, | ||
"waiting": [], | ||
"running": ["1", "0"], | ||
"request_outputs": ["1", "0"], | ||
"n_reserved_blocks": 5, | ||
"n_used_blocks": 4 # 2 blocks extended, one for each sequence | ||
}, | ||
{ | ||
# Sequence 2 joins: one iteration in waiting queue | ||
"step": 32, | ||
"tkv": 94, | ||
"waiting": ["2"], | ||
"running": ["1", "0"], | ||
"request_outputs": ["1", "0"], | ||
"n_reserved_blocks": 5, | ||
"n_used_blocks": 4 | ||
}, | ||
{ | ||
# Prefill sequence 2 | ||
"step": 33, | ||
"tkv": 94, | ||
"waiting": [], | ||
"running": ["2", "1", "0"], | ||
"request_outputs": ["2"], | ||
"n_reserved_blocks": 9, # prefill (2 blocks) + 103 decodes (2 blocks) | ||
"n_used_blocks": 6 | ||
}, | ||
{ | ||
# Decode sequences 0, 1, and 2 | ||
"step": 34, | ||
"tkv": 95, | ||
"waiting": [], | ||
"running": ["2", "1", "0"], | ||
"request_outputs": ["2", "1", "0"], | ||
"n_reserved_blocks": 9, | ||
"n_used_blocks": 6 | ||
}, | ||
{ | ||
# Sequence 1 finishes at step 54 | ||
# (start step + 2 prefills + 51 decodes - 1) = 2 + 2 + 51 - 1 = 54 | ||
"step": 54, | ||
"tkv": 115, | ||
"waiting": [], | ||
"running": ["2", "0"], | ||
"request_outputs": ["2", "1", "0"], | ||
"finished_requests": ["1"], | ||
"n_reserved_blocks": 9, | ||
"n_used_blocks": 6 | ||
}, | ||
{ | ||
# Decode sequences 0 and 2 | ||
"step": 55, | ||
"tkv": 116, | ||
"waiting": [], | ||
"running": ["2", "0"], | ||
"request_outputs": ["2", "0"], | ||
"n_reserved_blocks": 7, # two blocks released | ||
"n_used_blocks": 4 # two blocks released | ||
}, | ||
{ | ||
# Decode sequences 0 and 2, tkv crosses into a new block | ||
"step": 68, | ||
"tkv": 129, | ||
"waiting": [], | ||
"running": ["2", "0"], | ||
"request_outputs": ["2", "0"], | ||
"n_reserved_blocks": 7, | ||
"n_used_blocks": 6 # 2 blocks extended, one for each sequence | ||
}, | ||
{ | ||
# Sequence 0 finishes at step 121 | ||
# (start step + 3 prefills + 118 decode - 1) = 1 + 3 + 118 - 1 = 121 | ||
"step": 121, | ||
"tkv": 182, | ||
"waiting": [], | ||
"running": ["2"], | ||
"request_outputs": ["2", "0"], | ||
"finished_requests": ["0"], | ||
"n_reserved_blocks": 7, | ||
"n_used_blocks": 6 | ||
}, | ||
{ | ||
# Decode sequence 2 | ||
"step": 122, | ||
"tkv": 183, | ||
"waiting": [], | ||
"running": ["2"], | ||
"request_outputs": ["2"], | ||
"n_reserved_blocks": 4, # 3 blocks released | ||
"n_used_blocks": 3 # 3 blocks released | ||
}, | ||
{ | ||
# Sequence 3 joins: one iteration in waiting queue | ||
"step": 131, | ||
"tkv": 192, | ||
"waiting": ["3"], | ||
"running": ["2"], | ||
"request_outputs": ["2"], | ||
"n_reserved_blocks": 4, | ||
"n_used_blocks": 3 | ||
}, | ||
{ | ||
# Prefill sequence 3 | ||
"step": 132, | ||
"tkv": 192, | ||
"waiting": [], | ||
"running": ["3", "2"], | ||
"request_outputs": ["3"], | ||
"n_reserved_blocks": 8, # prefill (3 blocks) + 63 decodes (1 block) | ||
"n_used_blocks": 6 # prefill (3 blocks) | ||
}, | ||
{ | ||
# Decode sequences 2 and 3 | ||
"step": 133, | ||
"tkv": 193, | ||
"waiting": [], | ||
"running": ["3", "2"], | ||
"request_outputs": ["3", "2"], | ||
"n_reserved_blocks": 8, | ||
"n_used_blocks": 8 # 2 blocks extended, one for each sequence | ||
}, | ||
{ | ||
# Sequence 2 finishes at step 137 | ||
# (start step + 2 prefills + 103 decodes - 1) = 33 + 2 + 103 - 1 = 137 | ||
"step": 137, | ||
"tkv": 197, | ||
"waiting": [], | ||
"running": ["3"], | ||
"request_outputs": ["3", "2"], | ||
"finished_requests": ["2"], | ||
"n_reserved_blocks": 8, | ||
"n_used_blocks": 8 | ||
}, | ||
{ | ||
# Decode sequence 3 | ||
"step": 138, | ||
"tkv": 70, | ||
"waiting": [], | ||
"running": ["3"], | ||
"request_outputs": ["3"], | ||
# 6 blocks freed: finished sequence (4) + left padding stripping (2) | ||
"n_reserved_blocks": 2, | ||
"n_used_blocks": 2 | ||
}, | ||
{ | ||
# Sequence 3 finishes at step 195 | ||
# (start step + 1 prefill + 63 decodes - 1) = 132 + 1 + 63 - 1 = 195 | ||
"step": 195, | ||
"tkv": 127, | ||
"waiting": [], | ||
"running": [], | ||
"request_outputs": ["3"], | ||
"finished_requests": ["3"], | ||
"n_reserved_blocks": 2, | ||
"n_used_blocks": 2 | ||
}, | ||
{ | ||
# Tkv should be cleared one step later | ||
"step": 196, | ||
"tkv": 0, | ||
"waiting": [], | ||
"running": [], | ||
"request_outputs": [], | ||
"n_reserved_blocks": 0, | ||
"n_used_blocks": 0 | ||
}, | ||
# TODO this is when max_tokens = 65 for last prompt | ||
# { | ||
# # Sequence 3 finishes at step 196 | ||
# # (start step + 1 prefill + 64 decodes - 1) = 132 + 1 + 64 - 1 = 196 | ||
# "step": 196, | ||
# "tkv": 128, | ||
# "waiting": [], | ||
# "running": [], | ||
# "request_outputs": ["3"], | ||
# "finished_requests": ["3"], | ||
# "n_reserved_blocks": 2, | ||
# "n_used_blocks": 2 | ||
# }, | ||
# { | ||
# # Tkv should be cleared one step later | ||
# "step": 197, | ||
# "tkv": 0, | ||
# "waiting": [], | ||
# "running": [], | ||
# "request_outputs": [], | ||
# "n_reserved_blocks": 0, | ||
# "n_used_blocks": 0 | ||
# }, | ||
] | ||
|
||
check_scheduler_inference_steps( | ||
model=model, | ||
backend=backend, | ||
monkeypatch=monkeypatch, | ||
seqs_max_tokens=seqs_max_tokens, | ||
prompts_lengths=prompts_lengths, | ||
steps_add_reqs=steps_add_reqs, | ||
checked_steps=checked_steps, | ||
max_num_seqs=max_num_seqs, | ||
max_model_len=max_model_len, | ||
available_blocks=available_blocks, | ||
use_cb=True, | ||
) | ||
|
@@ -494,6 +763,7 @@ def test_prompt_too_long_for_current_tkv(model: str, backend: str, | |
steps_add_reqs = [0, 0] | ||
available_blocks = -1 # no restriction | ||
max_num_seqs = 2 | ||
max_model_len = 256 | ||
|
||
checked_steps = [ | ||
{ | ||
|
@@ -617,6 +887,7 @@ def test_prompt_too_long_for_current_tkv(model: str, backend: str, | |
steps_add_reqs=steps_add_reqs, | ||
checked_steps=checked_steps, | ||
max_num_seqs=max_num_seqs, | ||
max_model_len=max_model_len, | ||
available_blocks=available_blocks, | ||
use_cb=True, | ||
) | ||
|
@@ -642,6 +913,7 @@ def test_requested_tokens_not_fitting_remaining_space( | |
steps_add_reqs = [0, 0, 0] | ||
available_blocks = -1 # no restriction | ||
max_num_seqs = 2 | ||
max_model_len = 256 | ||
|
||
checked_steps = [ | ||
{ | ||
|
@@ -802,6 +1074,7 @@ def test_requested_tokens_not_fitting_remaining_space( | |
steps_add_reqs=steps_add_reqs, | ||
checked_steps=checked_steps, | ||
max_num_seqs=max_num_seqs, | ||
max_model_len=max_model_len, | ||
available_blocks=available_blocks, | ||
use_cb=True, | ||
) | ||
|
@@ -830,6 +1103,8 @@ def test_requests_use_all_available_blocks(model: str, backend: str, | |
# total number of blocks needed if scheduled together : 4 * (1 + 1) = 8 | ||
available_blocks = 8 | ||
max_num_seqs = 4 | ||
max_model_len = 256 | ||
|
||
checked_steps = [ | ||
{ | ||
"step": 0, | ||
|
@@ -933,6 +1208,7 @@ def test_requests_use_all_available_blocks(model: str, backend: str, | |
steps_add_reqs=steps_add_reqs, | ||
checked_steps=checked_steps, | ||
max_num_seqs=max_num_seqs, | ||
max_model_len=max_model_len, | ||
available_blocks=available_blocks, | ||
use_cb=True, | ||
) | ||
|
@@ -962,6 +1238,8 @@ def test_requests_use_more_than_available_blocks( | |
# total number of blocks needed if scheduled together : 4 * (1 + 1) = 8 | ||
available_blocks = 4 | ||
max_num_seqs = 4 | ||
max_model_len = 256 | ||
|
||
checked_steps = [ | ||
{ | ||
"step": 0, | ||
|
@@ -1090,6 +1368,7 @@ def test_requests_use_more_than_available_blocks( | |
steps_add_reqs=steps_add_reqs, | ||
checked_steps=checked_steps, | ||
max_num_seqs=max_num_seqs, | ||
max_model_len=max_model_len, | ||
available_blocks=available_blocks, | ||
use_cb=True, | ||
) |
Uh oh!
There was an error while loading. Please reload this page.