@@ -1041,3 +1041,297 @@ def test_no_batches_mixed_with_prompt_tokens_and_prompt_embeds():
         for seq in scheduled_seq_group.seq_group.seqs:
             seq.status = SequenceStatus.FINISHED_STOPPED
         scheduler.free_finished_seq_groups()
+
+
+def test_remove_seq_from_computed_blocks_tracker():
+    """
+    Test that computed_blocks_tracker correctly removes stale sequences
+    during scheduling.
+
+    The test covers 9 scheduling branches where stale seqs are removed:
+    - 1 in _schedule_swapped
+    - 1 in _schedule_priority_preemption
+    - 7 in _schedule_prefills
+
+    Each branch is tested to ensure proper cleanup of
+    _seq_id_to_num_tokens_computed.
+    """
+    # Budget cannot schedule in swapped
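+    # (token_budget=15 is smaller than the 16-token sequences built below,
+    # so _schedule_swapped has to leave the groups swapped out)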
+    block_size = 2
+    max_seq_group = 3
+    seq_tokens_with_swapped: list[list[int]] = []
+    blocks_to_swap_out: list[tuple[int, int]] = []
+    curr_loras: set[int] = set()
+
+    scheduler = initialize_scheduler(
+        block_size=block_size,
+        num_cpu_blocks=64,
+        num_gpu_blocks=16,
+        max_num_seqs=max_seq_group,
+        enable_prefix_caching=True,
+    )
+    budget = create_token_budget(token_budget=15)
+
+    seq_length = 16
+    num_seqs = 3
+    for i in range(num_seqs):
+        seq_tokens_with_swapped.append([i] * seq_length)
+
+    seq_and_seq_groups = [
+        create_dummy_prompt(f"{i}",
+                            prompt_tokens=seq_tokens_with_swapped[i],
+                            block_size=block_size)
+        for i in range(len(seq_tokens_with_swapped))
+    ]
+
+    for _, seq_group in seq_and_seq_groups:
+        scheduler._allocate_and_set_running(seq_group)
+        scheduler._swap_out(seq_group, blocks_to_swap_out)
+        scheduler._add_seq_group_to_swapped(seq_group)
+
+    scheduler._schedule_swapped(budget, curr_loras)
+    seq_id_to_num_tokens_computed = (
+        scheduler.block_manager._computed_blocks_tracker.
+        _seq_id_to_num_tokens_computed.get(1))
+    assert seq_id_to_num_tokens_computed is None
+
+    # Prefill schedule doesn't have space for another LoRA, so
+    # we ignore this request for now.
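+    # (max_loras=1, but the two groups below request two distinct LoRAs;
+    # only the first fits, so the second is put back in waiting)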
+    block_size = 4
+    lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
+    scheduler = initialize_scheduler(lora_config=lora_config,
+                                     block_size=block_size,
+                                     num_cpu_blocks=64,
+                                     num_gpu_blocks=64,
+                                     enable_prefix_caching=True)
+    budget = create_token_budget(token_budget=120)
+    num_seqs = 2
+    for i in range(num_seqs):
+        _, seq_group = create_dummy_prompt(str(i),
+                                           prompt_length=seq_length,
+                                           block_size=block_size,
+                                           lora_request=LoRARequest(
+                                               lora_name=str(i),
+                                               lora_int_id=i + 1,
+                                               lora_path="abc"))
+        scheduler.add_seq_group(seq_group)
+
+    scheduler._schedule_prefills(budget, curr_loras)
+    seq_id_to_num_tokens_computed = (
+        scheduler.block_manager._computed_blocks_tracker.
+        _seq_id_to_num_tokens_computed.get(1))
+    assert seq_id_to_num_tokens_computed is None
+
+    # Priority preemption schedule
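+    # (per the docstring, the preemption path is the one branch outside
+    # prefill/swapped that must also purge stale tracker entries)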
+    scheduler._schedule_priority_preemption(budget)
+    seq_id_to_num_tokens_computed = (
+        scheduler.block_manager._computed_blocks_tracker.
+        _seq_id_to_num_tokens_computed.get(1))
+    assert seq_id_to_num_tokens_computed is None
+
+    # Prefill scheduler does not schedule batches with prompt tokens and
+    # prompt embeddings co-mingled.
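+    # (one group below carries plain tokens, the other prompt_embeds; the
+    # mixed one is deferred, so its tracker entry must be gone)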
+    block_size = 2
+    max_seq_group = 3
+    scheduler = initialize_scheduler(
+        block_size=block_size,
+        num_cpu_blocks=16,
+        num_gpu_blocks=16,
+        max_num_seqs=max_seq_group,
+        max_model_len=100,
+        enable_prefix_caching=True,
+    )
+    seq_length = 7
+    embedding_size = 5
+    seq_tokens_with_embedding: list[list[int]] = []
+    seq_embeds: list[Optional[torch.Tensor]] = []
+
+    seq_tokens_with_embedding.append(list(range(seq_length)))
+    seq_embeds.append(None)
+    seq_tokens_with_embedding.append([0] * seq_length)
+    seq_embeds.append(torch.rand(embedding_size))
+
+    seq_and_seq_groups = [
+        create_dummy_prompt(f"{i}",
+                            prompt_tokens=seq_tokens_with_embedding[i],
+                            prompt_embeds=seq_embeds[i],
+                            block_size=block_size)
+        for i in range(len(seq_tokens_with_embedding))
+    ]
+
+    for _, seq_group in seq_and_seq_groups:
+        scheduler.add_seq_group(seq_group)
+
+    scheduler._schedule_default()
+    seq_id_to_num_tokens_computed = (
+        scheduler.block_manager._computed_blocks_tracker.
+        _seq_id_to_num_tokens_computed.get(1))
+    assert seq_id_to_num_tokens_computed is None
+
+    # Prefill stops once budget num_batched_tokens reaches
+    # scheduler_config max_num_batched_tokens
+    block_size = 2
+    max_seq_group = 3
+    seq_tokens_prefill_budget: list[list[int]] = []
+
+    scheduler = initialize_scheduler(
+        block_size=block_size,
+        max_token_budget=8,
+        num_cpu_blocks=16,
+        num_gpu_blocks=16,
+        max_num_seqs=max_seq_group,
+        max_model_len=5,
+        enable_prefix_caching=True,
+    )
+    seq_length = 4
+    num_seqs = 3
+    for i in range(num_seqs):
+        seq_tokens_prefill_budget.append([i] * seq_length)
+
+    seq_and_seq_groups = [
+        create_dummy_prompt(f"{i}",
+                            prompt_tokens=seq_tokens_prefill_budget[i],
+                            block_size=block_size)
+        for i in range(len(seq_tokens_prefill_budget))
+    ]
+
+    for _, seq_group in seq_and_seq_groups:
+        scheduler.add_seq_group(seq_group)
+
+    scheduler._schedule_default()
+    seq_id_to_num_tokens_computed = (
+        scheduler.block_manager._computed_blocks_tracker.
+        _seq_id_to_num_tokens_computed.get(2))
+    assert seq_id_to_num_tokens_computed is None
+
+    # Budget cannot schedule in waiting
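+    # (a budget of 30 tokens covers one 16-token prompt but not two, so
+    # the second group is left in waiting)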
+    block_size = 2
+    max_seq_group = 3
+
+    scheduler = initialize_scheduler(
+        block_size=block_size,
+        max_token_budget=30,
+        num_cpu_blocks=16,
+        num_gpu_blocks=16,
+        max_num_seqs=max_seq_group,
+        max_model_len=30,
+        enable_prefix_caching=True,
+    )
+    seq_length = 16
+    num_seqs = 3
+    seq_tokens_prefill_budget_waiting: list[list[int]] = []
+
+    for i in range(num_seqs):
+        seq_tokens_prefill_budget_waiting.append(list(range(seq_length)))
+
+    seq_and_seq_groups = [
+        create_dummy_prompt(f"{i}",
+                            prompt_tokens=seq_tokens_prefill_budget_waiting[i],
+                            block_size=block_size)
+        for i in range(len(seq_tokens_prefill_budget_waiting))
+    ]
+
+    for _, seq_group in seq_and_seq_groups:
+        scheduler.add_seq_group(seq_group)
+
+    scheduler._schedule_default()
+    seq_id_to_num_tokens_computed = (
+        scheduler.block_manager._computed_blocks_tracker.
+        _seq_id_to_num_tokens_computed.get(1))
+    assert seq_id_to_num_tokens_computed is None
+
+    # Sequence with num_new_tokens > prompt_limit is marked FINISHED_IGNORED
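+    # (a 31-token prompt exceeds max_model_len=30, so the request is
+    # ignored rather than scheduled)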
+    block_size = 2
+    max_seq_group = 3
+    scheduler = initialize_scheduler(
+        block_size=block_size,
+        num_cpu_blocks=16,
+        num_gpu_blocks=16,
+        max_num_seqs=max_seq_group,
+        max_model_len=30,
+        enable_prefix_caching=True,
+    )
+
+    seq_length = 31
+    seq_tokens_prompt_limit: list[list[int]] = []
+    seq_tokens_prompt_limit.append(list(range(seq_length)))
+    seq_and_seq_groups = [
+        create_dummy_prompt("0",
+                            prompt_tokens=seq_tokens_prompt_limit[0],
+                            block_size=block_size)
+    ]
+    for _, seq_group in seq_and_seq_groups:
+        scheduler.add_seq_group(seq_group)
+    scheduler._schedule_default()
+    seq_id_to_num_tokens_computed = (
+        scheduler.block_manager._computed_blocks_tracker.
+        _seq_id_to_num_tokens_computed.get(0))
+    assert seq_id_to_num_tokens_computed is None
+
+    # Block manager can never allocate (AllocStatus.NEVER);
+    # the seq group is marked FINISHED_IGNORED
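+    # (a 320-token prompt with block_size=2 needs all 160 GPU blocks, so
+    # the block manager reports the request can never be allocated)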
+    block_size = 2
+    max_seq_group = 3
+    scheduler = initialize_scheduler(
+        block_size=block_size,
+        num_cpu_blocks=160,
+        num_gpu_blocks=160,
+        max_num_seqs=max_seq_group,
+        max_model_len=320,
+        enable_prefix_caching=True,
+    )
+
+    seq_length = 320
+    num_seqs = 1
+    seq_tokens_never: list[list[int]] = []
+    for i in range(num_seqs):
+        seq_tokens_never.append(list(range(seq_length)))
+
+    seq_and_seq_groups = [
+        create_dummy_prompt(f"{i}",
+                            prompt_tokens=seq_tokens_never[i],
+                            block_size=block_size)
+        for i in range(len(seq_tokens_never))
+    ]
+
+    for _, seq_group in seq_and_seq_groups:
+        scheduler.add_seq_group(seq_group)
+
+    scheduler._schedule_default()
+    seq_id_to_num_tokens_computed = (
+        scheduler.block_manager._computed_blocks_tracker.
+        _seq_id_to_num_tokens_computed.get(0))
+    assert seq_id_to_num_tokens_computed is None
+
+    # Block manager cannot allocate right now (AllocStatus.LATER)
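+    # (two 160-token prompts need 80 blocks each; once the first is
+    # allocated, the remaining free blocks presumably fall below the
+    # watermark, so the second group is deferred)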
+    block_size = 2
+    max_seq_group = 3
+    scheduler = initialize_scheduler(
+        block_size=block_size,
+        num_cpu_blocks=160,
+        num_gpu_blocks=160,
+        max_num_seqs=max_seq_group,
+        max_model_len=320,
+        enable_prefix_caching=True,
+    )
+
+    seq_length = 160
+    num_seqs = 2
+    seq_tokens_later: list[list[int]] = []
+    for i in range(num_seqs):
+        seq_tokens_later.append(list(range(seq_length)))
+
+    seq_and_seq_groups = [
+        create_dummy_prompt(f"{i}",
+                            prompt_tokens=seq_tokens_later[i],
+                            block_size=block_size)
+        for i in range(len(seq_tokens_later))
+    ]
+
+    for _, seq_group in seq_and_seq_groups:
+        scheduler.add_seq_group(seq_group)
+
+    scheduler._schedule_default()
+    seq_id_to_num_tokens_computed = (
+        scheduler.block_manager._computed_blocks_tracker.
+        _seq_id_to_num_tokens_computed.get(1))
+    assert seq_id_to_num_tokens_computed is None