import pytest
from scheduling_utils import check_scheduler_inference_steps
-from spyre_util import get_spyre_backend_list, get_spyre_model_list
+from spyre_util import (compare_results, generate_hf_output,
+                        get_spyre_backend_list, get_spyre_model_list)


@pytest.mark.cb
@@ -34,6 +35,8 @@ def test_prompts_aligned_with_tkv_boundaries(model: str, backend: str,
    available_blocks = -1  # no restriction
    max_num_seqs = 2
    max_model_len = 256
+    # check_output = backend == "sendnn"
+    check_output = True

    checked_steps = [
        {
@@ -162,7 +165,7 @@ def test_prompts_aligned_with_tkv_boundaries(model: str, backend: str,
        },
    ]

-    check_scheduler_inference_steps(
+    cb_outputs, prompts = check_scheduler_inference_steps(
        model=model,
        backend=backend,
        monkeypatch=monkeypatch,
@@ -174,8 +177,22 @@ def test_prompts_aligned_with_tkv_boundaries(model: str, backend: str,
        max_model_len=max_model_len,
        available_blocks=available_blocks,
        use_cb=True,
+        collect_outputs=check_output,
    )

+    if check_output:
+        hf_outputs = generate_hf_output(
+            model=model,
+            prompts=prompts,
+            max_new_tokens=seqs_max_tokens,
+            ignore_eos=True,
+        )
+        compare_results(model=model,
+                        tensor_parallel_size=1,
+                        backend=backend,
+                        vllm_results=cb_outputs,
+                        hf_results=hf_outputs)
+

@pytest.mark.cb
@pytest.mark.parametrize("model", get_spyre_model_list())
@@ -200,6 +217,8 @@ def test_prompts_misaligned_with_tkv_boundaries(
    available_blocks = -1  # no restriction
    max_num_seqs = 2
    max_model_len = 256
+    # check_output = backend == "sendnn"
+    check_output = True

    checked_steps = [
        {
@@ -326,7 +345,7 @@ def test_prompts_misaligned_with_tkv_boundaries(
        },
    ]

-    check_scheduler_inference_steps(
+    cb_outputs, prompts = check_scheduler_inference_steps(
        model=model,
        backend=backend,
        monkeypatch=monkeypatch,
@@ -338,8 +357,22 @@ def test_prompts_misaligned_with_tkv_boundaries(
        max_model_len=max_model_len,
        available_blocks=available_blocks,
        use_cb=True,
+        collect_outputs=check_output,
    )

+    if check_output:
+        hf_outputs = generate_hf_output(
+            model=model,
+            prompts=prompts,
+            max_new_tokens=seqs_max_tokens,
+            ignore_eos=True,
+        )
+        compare_results(model=model,
+                        tensor_parallel_size=1,
+                        backend=backend,
+                        vllm_results=cb_outputs,
+                        hf_results=hf_outputs)
+

@pytest.mark.cb
@pytest.mark.parametrize("model", get_spyre_model_list())
@@ -363,6 +396,8 @@ def test_two_sequences_finish_same_time_as_new_arrive(
    available_blocks = -1  # no restriction
    max_num_seqs = 2
    max_model_len = 256
+    # check_output = backend == "sendnn"
+    check_output = True

    checked_steps = [
        {
@@ -466,7 +501,7 @@ def test_two_sequences_finish_same_time_as_new_arrive(
        },
    ]

-    check_scheduler_inference_steps(
+    cb_outputs, prompts = check_scheduler_inference_steps(
        model=model,
        backend=backend,
        monkeypatch=monkeypatch,
@@ -478,8 +513,22 @@ def test_two_sequences_finish_same_time_as_new_arrive(
        max_model_len=max_model_len,
        available_blocks=available_blocks,
        use_cb=True,
+        collect_outputs=check_output,
    )

+    if check_output:
+        hf_outputs = generate_hf_output(
+            model=model,
+            prompts=prompts,
+            max_new_tokens=seqs_max_tokens,
+            ignore_eos=True,
+        )
+        compare_results(model=model,
+                        tensor_parallel_size=1,
+                        backend=backend,
+                        vllm_results=cb_outputs,
+                        hf_results=hf_outputs)
+

@pytest.mark.cb
@pytest.mark.parametrize("model", get_spyre_model_list())
@@ -504,6 +553,8 @@ def test_new_sequence_joins_during_decode(model: str, backend: str,
    available_blocks = -1  # no restriction
    max_num_seqs = 4
    max_model_len = 256
+    # check_output = backend == "sendnn"
+    check_output = True

    checked_steps = [
        {
@@ -729,7 +780,7 @@ def test_new_sequence_joins_during_decode(model: str, backend: str,
        # },
    ]

-    check_scheduler_inference_steps(
+    cb_outputs, prompts = check_scheduler_inference_steps(
        model=model,
        backend=backend,
        monkeypatch=monkeypatch,
@@ -741,8 +792,22 @@ def test_new_sequence_joins_during_decode(model: str, backend: str,
        max_model_len=max_model_len,
        available_blocks=available_blocks,
        use_cb=True,
+        collect_outputs=check_output,
    )

+    if check_output:
+        hf_outputs = generate_hf_output(
+            model=model,
+            prompts=prompts,
+            max_new_tokens=seqs_max_tokens,
+            ignore_eos=True,
+        )
+        compare_results(model=model,
+                        tensor_parallel_size=1,
+                        backend=backend,
+                        vllm_results=cb_outputs,
+                        hf_results=hf_outputs)
+

@pytest.mark.cb
@pytest.mark.parametrize("model", get_spyre_model_list())
@@ -764,6 +829,7 @@ def test_prompt_too_long_for_current_tkv(model: str, backend: str,
    available_blocks = -1  # no restriction
    max_num_seqs = 2
    max_model_len = 256
+    check_output = False

    checked_steps = [
        {
@@ -878,7 +944,7 @@ def test_prompt_too_long_for_current_tkv(model: str, backend: str,
        },
    ]

-    check_scheduler_inference_steps(
+    cb_outputs, prompts = check_scheduler_inference_steps(
        model=model,
        backend=backend,
        monkeypatch=monkeypatch,
@@ -890,15 +956,30 @@ def test_prompt_too_long_for_current_tkv(model: str, backend: str,
        max_model_len=max_model_len,
        available_blocks=available_blocks,
        use_cb=True,
+        collect_outputs=check_output,
    )

+    if check_output:
+        hf_outputs = generate_hf_output(
+            model=model,
+            prompts=prompts,
+            max_new_tokens=seqs_max_tokens,
+            ignore_eos=True,
+        )
+        compare_results(model=model,
+                        tensor_parallel_size=1,
+                        backend=backend,
+                        vllm_results=cb_outputs,
+                        hf_results=hf_outputs)
+

@pytest.mark.cb
@pytest.mark.parametrize("model", get_spyre_model_list())
@pytest.mark.parametrize("backend", get_spyre_backend_list())
def test_requested_tokens_not_fitting_remaining_space(
        model: str, backend: str, monkeypatch: pytest.MonkeyPatch):
-    """ Scenario where the request goes beyond max_model_len
+    """ Scenario where the request goes beyond max_model_len and needs to wait
+    for a new batch.

    Configuration:
        * max_num_seqs: 2
@@ -914,6 +995,7 @@ def test_requested_tokens_not_fitting_remaining_space(
    available_blocks = -1  # no restriction
    max_num_seqs = 2
    max_model_len = 256
+    check_output = False

    checked_steps = [
        {
@@ -1065,7 +1147,7 @@ def test_requested_tokens_not_fitting_remaining_space(
        },
    ]

-    check_scheduler_inference_steps(
+    cb_outputs, prompts = check_scheduler_inference_steps(
        model=model,
        backend=backend,
        monkeypatch=monkeypatch,
@@ -1077,8 +1159,22 @@ def test_requested_tokens_not_fitting_remaining_space(
        max_model_len=max_model_len,
        available_blocks=available_blocks,
        use_cb=True,
+        collect_outputs=check_output,
    )

+    if check_output:
+        hf_outputs = generate_hf_output(
+            model=model,
+            prompts=prompts,
+            max_new_tokens=seqs_max_tokens,
+            ignore_eos=True,
+        )
+        compare_results(model=model,
+                        tensor_parallel_size=1,
+                        backend=backend,
+                        vllm_results=cb_outputs,
+                        hf_results=hf_outputs)
+

@pytest.mark.cb
@pytest.mark.parametrize("model", get_spyre_model_list())
@@ -1104,6 +1200,8 @@ def test_requests_use_all_available_blocks(model: str, backend: str,
    available_blocks = 8
    max_num_seqs = 4
    max_model_len = 256
+    # check_output = backend == "sendnn"
+    check_output = True

    checked_steps = [
        {
@@ -1199,7 +1297,7 @@ def test_requests_use_all_available_blocks(model: str, backend: str,
        },
    ]

-    check_scheduler_inference_steps(
+    cb_outputs, prompts = check_scheduler_inference_steps(
        model=model,
        backend=backend,
        monkeypatch=monkeypatch,
@@ -1211,8 +1309,22 @@ def test_requests_use_all_available_blocks(model: str, backend: str,
        max_model_len=max_model_len,
        available_blocks=available_blocks,
        use_cb=True,
+        collect_outputs=check_output,
    )

+    if check_output:
+        hf_outputs = generate_hf_output(
+            model=model,
+            prompts=prompts,
+            max_new_tokens=seqs_max_tokens,
+            ignore_eos=True,
+        )
+        compare_results(model=model,
+                        tensor_parallel_size=1,
+                        backend=backend,
+                        vllm_results=cb_outputs,
+                        hf_results=hf_outputs)
+

@pytest.mark.cb
@pytest.mark.parametrize("model", get_spyre_model_list())
@@ -1239,6 +1351,8 @@ def test_requests_use_more_than_available_blocks(
    available_blocks = 4
    max_num_seqs = 4
    max_model_len = 256
+    # check_output = backend == "sendnn"
+    check_output = True

    checked_steps = [
        {
@@ -1359,7 +1473,7 @@ def test_requests_use_more_than_available_blocks(
        },
    ]

-    check_scheduler_inference_steps(
+    cb_outputs, prompts = check_scheduler_inference_steps(
        model=model,
        backend=backend,
        monkeypatch=monkeypatch,
@@ -1371,4 +1485,18 @@ def test_requests_use_more_than_available_blocks(
        max_model_len=max_model_len,
        available_blocks=available_blocks,
        use_cb=True,
+        collect_outputs=check_output,
    )
+
+    if check_output:
+        hf_outputs = generate_hf_output(
+            model=model,
+            prompts=prompts,
+            max_new_tokens=seqs_max_tokens,
+            ignore_eos=True,
+        )
+        compare_results(model=model,
+                        tensor_parallel_size=1,
+                        backend=backend,
+                        vllm_results=cb_outputs,
+                        hf_results=hf_outputs)
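
Note: every hunk above wires in the same pattern — collect the continuous-batching outputs from the scheduler test helper, generate a plain Hugging Face reference for the same prompts, and compare the two. The real helpers live in spyre_util (generate_hf_output, compare_results); the standalone sketch below only illustrates the idea under stated assumptions: greedy decoding, a single max_new_tokens value rather than the per-sequence list the tests pass, and hypothetical function names that are not part of spyre_util.

    # Illustrative sketch of an HF reference check (assumptions noted above).
    from transformers import AutoModelForCausalLM, AutoTokenizer


    def hf_reference_output(model_name: str, prompts: list[str],
                            max_new_tokens: int) -> list[str]:
        """Greedy-decode each prompt with the vanilla HF model."""
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name)
        texts = []
        for prompt in prompts:
            inputs = tokenizer(prompt, return_tensors="pt")
            # ignore_eos=True in the tests is approximated here with
            # min_new_tokens: force the full requested generation length.
            out = model.generate(**inputs,
                                 max_new_tokens=max_new_tokens,
                                 min_new_tokens=max_new_tokens,
                                 do_sample=False)
            # Keep only the newly generated tokens, not the prompt.
            new_tokens = out[0, inputs["input_ids"].shape[1]:]
            texts.append(tokenizer.decode(new_tokens,
                                          skip_special_tokens=True))
        return texts


    def assert_outputs_match(vllm_texts: list[str],
                             hf_texts: list[str]) -> None:
        """Fail if any continuous-batching output drifts from the reference."""
        for i, (got, want) in enumerate(zip(vllm_texts, hf_texts)):
            assert got == want, f"prompt {i}: {got!r} != {want!r}"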