
Commit a768781

quanliu1991 (刘全) authored and committed
[Bugfix][Core] Prefix caching causes incorrect outputs due to outdated ComputedBlocksTracker (vllm-project#18957)
Signed-off-by: 刘全 <quan.liu2@dbappsecurity.com.cn>
Co-authored-by: 刘全 <quan.liu2@dbappsecurity.com.cn>
Signed-off-by: Yang Wang <elainewy@meta.com>
1 parent 9fcd2dd commit a768781
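The failure mode behind this fix: vLLM's ComputedBlocksTracker memoizes, per sequence id, how many prompt tokens are already covered by prefix-cache blocks. If a scheduling pass inspects a sequence but then gives up on it (budget exhausted, no free LoRA slot, allocation deferred, and so on) without clearing that entry, a later pass can trust a count that no longer matches the cache, and those tokens are never prefilled. A toy sketch of the mechanism follows; the names are illustrative only (ToyTracker is not vLLM's API), and only the attribute _seq_id_to_num_tokens_computed and the method remove_seq mirror names that appear in this diff.

# Toy model of the stale-tracker bug (hypothetical names throughout).
class ToyTracker:
    """Memoizes per-sequence prefix-cache hit counts between lookups."""

    def __init__(self) -> None:
        self._seq_id_to_num_tokens_computed: dict[int, int] = {}

    def get_num_computed_tokens(self, seq_id: int, cache_hits: int) -> int:
        # Computed once; the memoized value is served on every later call.
        return self._seq_id_to_num_tokens_computed.setdefault(seq_id, cache_hits)

    def remove_seq(self, seq_id: int) -> None:
        # The cleanup this commit wires into the scheduler's skip branches.
        self._seq_id_to_num_tokens_computed.pop(seq_id, None)


tracker = ToyTracker()

# Pass 1: seq 7 is inspected but not scheduled; 8 of its tokens hit the cache.
assert tracker.get_num_computed_tokens(7, cache_hits=8) == 8

# The cached blocks are evicted before pass 2, so the true hit count is 0,
# yet the tracker still answers 8 -- those 8 tokens would never be prefilled.
assert tracker.get_num_computed_tokens(7, cache_hits=0) == 8  # stale!

# With the fix, every branch that gives up on a sequence clears its entry:
tracker.remove_seq(7)
assert tracker.get_num_computed_tokens(7, cache_hits=0) == 0

The commit accordingly clears the tracker entry in every scheduler branch that drops a sequence, and the new test asserts the entry is gone after each one.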

File tree

3 files changed: +331 -0 lines changed

tests/core/test_scheduler.py

Lines changed: 294 additions & 0 deletions
@@ -1041,3 +1041,297 @@ def test_no_batches_mixed_with_prompt_tokens_and_prompt_embeds():
         for seq in scheduled_seq_group.seq_group.seqs:
             seq.status = SequenceStatus.FINISHED_STOPPED
     scheduler.free_finished_seq_groups()
+
+
+def test_remove_seq_from_computed_blocks_tracker():
+    """
+    Test that computed_blocks_tracker correctly removes stale sequences
+    during scheduling.
+
+    The test covers 9 scheduling branches where stale seqs are removed:
+    - 1 in _schedule_swapped
+    - 1 in _schedule_priority_preemption
+    - 7 in _schedule_prefills
+
+    Each branch is tested to ensure proper cleanup of
+    _seq_id_to_num_tokens_computed.
+    """
+    # Budget cannot schedule in swapped
+    block_size = 2
+    max_seq_group = 3
+    seq_tokens_with_swapped: list[list[int]] = []
+    blocks_to_swap_out: list[tuple[int, int]] = []
+    curr_loras: set[int] = set()
+
+    scheduler = initialize_scheduler(
+        block_size=block_size,
+        num_cpu_blocks=64,
+        num_gpu_blocks=16,
+        max_num_seqs=max_seq_group,
+        enable_prefix_caching=True,
+    )
+    budget = create_token_budget(token_budget=15)
+
+    seq_length = 16
+    num_seqs = 3
+    for i in range(num_seqs):
+        seq_tokens_with_swapped.append([i] * seq_length)
+
+    seq_and_seq_groups = [
+        create_dummy_prompt(f"{i}",
+                            prompt_tokens=seq_tokens_with_swapped[i],
+                            block_size=block_size)
+        for i in range(len(seq_tokens_with_swapped))
+    ]
+
+    for _, seq_group in seq_and_seq_groups:
+        scheduler._allocate_and_set_running(seq_group)
+        scheduler._swap_out(seq_group, blocks_to_swap_out)
+        scheduler._add_seq_group_to_swapped(seq_group)
+
+    scheduler._schedule_swapped(budget, curr_loras)
+    seq_id_to_num_tokens_computed = (
+        scheduler.block_manager._computed_blocks_tracker.
+        _seq_id_to_num_tokens_computed.get(1))
+    assert seq_id_to_num_tokens_computed is None
+
+    # Prefill schedule doesn't have space for another LoRA, so
+    # we ignore this request for now.
+    block_size = 4
+    lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
+    scheduler = initialize_scheduler(lora_config=lora_config,
+                                     block_size=block_size,
+                                     num_cpu_blocks=64,
+                                     num_gpu_blocks=64,
+                                     enable_prefix_caching=True)
+    budget = create_token_budget(token_budget=120)
+    num_seqs = 2
+    for i in range(num_seqs):
+        _, seq_group = create_dummy_prompt(str(i),
+                                           prompt_length=seq_length,
+                                           block_size=block_size,
+                                           lora_request=LoRARequest(
+                                               lora_name=str(i),
+                                               lora_int_id=i + 1,
+                                               lora_path="abc"))
+        scheduler.add_seq_group(seq_group)
+
+    scheduler._schedule_prefills(budget, curr_loras)
+    seq_id_to_num_tokens_computed = (
+        scheduler.block_manager._computed_blocks_tracker.
+        _seq_id_to_num_tokens_computed.get(1))
+    assert seq_id_to_num_tokens_computed is None
+
+    # Priority preemption schedule
+    scheduler._schedule_priority_preemption(budget)
+    seq_id_to_num_tokens_computed = (
+        scheduler.block_manager._computed_blocks_tracker.
+        _seq_id_to_num_tokens_computed.get(1))
+    assert seq_id_to_num_tokens_computed is None
+
+    # Prefill scheduler does not schedule batches with prompt tokens and
+    # prompt embeddings co-mingled.
+    block_size = 2
+    max_seq_group = 3
+    scheduler = initialize_scheduler(
+        block_size=block_size,
+        num_cpu_blocks=16,
+        num_gpu_blocks=16,
+        max_num_seqs=max_seq_group,
+        max_model_len=100,
+        enable_prefix_caching=True,
+    )
+    seq_length = 7
+    embedding_size = 5
+    seq_tokens_with_embedding: list[list[int]] = []
+    seq_embeds: list[Optional[torch.Tensor]] = []
+
+    seq_tokens_with_embedding.append(list(range(seq_length)))
+    seq_embeds.append(None)
+    seq_tokens_with_embedding.append([0] * seq_length)
+    seq_embeds.append(torch.rand(embedding_size))
+
+    seq_and_seq_groups = [
+        create_dummy_prompt(f"{i}",
+                            prompt_tokens=seq_tokens_with_embedding[i],
+                            prompt_embeds=seq_embeds[i],
+                            block_size=block_size)
+        for i in range(len(seq_tokens_with_embedding))
+    ]
+
+    for _, seq_group in seq_and_seq_groups:
+        scheduler.add_seq_group(seq_group)
+
+    scheduler._schedule_default()
+    seq_id_to_num_tokens_computed = (
+        scheduler.block_manager._computed_blocks_tracker.
+        _seq_id_to_num_tokens_computed.get(1))
+    assert seq_id_to_num_tokens_computed is None
+
+    # Prefill scheduler budget num_batched_tokens
+    # >= scheduler_config max_num_batched_tokens
+    block_size = 2
+    max_seq_group = 3
+    seq_tokens_prefill_budget: list[list[int]] = []
+
+    scheduler = initialize_scheduler(
+        block_size=block_size,
+        max_token_budget=8,
+        num_cpu_blocks=16,
+        num_gpu_blocks=16,
+        max_num_seqs=max_seq_group,
+        max_model_len=5,
+        enable_prefix_caching=True,
+    )
+    seq_length = 4
+    num_seqs = 3
+    for i in range(num_seqs):
+        seq_tokens_prefill_budget.append([i] * seq_length)
+
+    seq_and_seq_groups = [
+        create_dummy_prompt(f"{i}",
+                            prompt_tokens=seq_tokens_prefill_budget[i],
+                            block_size=block_size)
+        for i in range(len(seq_tokens_prefill_budget))
+    ]
+
+    for _, seq_group in seq_and_seq_groups:
+        scheduler.add_seq_group(seq_group)
+
+    scheduler._schedule_default()
+    seq_id_to_num_tokens_computed = (
+        scheduler.block_manager._computed_blocks_tracker.
+        _seq_id_to_num_tokens_computed.get(2))
+    assert seq_id_to_num_tokens_computed is None
+
+    # Budget cannot schedule in waiting
+    block_size = 2
+    max_seq_group = 3
+
+    scheduler = initialize_scheduler(
+        block_size=block_size,
+        max_token_budget=30,
+        num_cpu_blocks=16,
+        num_gpu_blocks=16,
+        max_num_seqs=max_seq_group,
+        max_model_len=30,
+        enable_prefix_caching=True,
+    )
+    seq_length = 16
+    num_seqs = 3
+    seq_tokens_prefill_budget_waiting: list[list[int]] = []
+
+    for i in range(num_seqs):
+        seq_tokens_prefill_budget_waiting.append(list(range(seq_length)))
+
+    seq_and_seq_groups = [
+        create_dummy_prompt(f"{i}",
+                            prompt_tokens=seq_tokens_prefill_budget_waiting[i],
+                            block_size=block_size)
+        for i in range(len(seq_tokens_prefill_budget_waiting))
+    ]
+
+    for _, seq_group in seq_and_seq_groups:
+        scheduler.add_seq_group(seq_group)
+
+    scheduler._schedule_default()
+    seq_id_to_num_tokens_computed = (
+        scheduler.block_manager._computed_blocks_tracker.
+        _seq_id_to_num_tokens_computed.get(1))
+    assert seq_id_to_num_tokens_computed is None
+
+    # Sequence num_new_tokens > prompt_limit is marked FINISHED_IGNORED
+    block_size = 2
+    max_seq_group = 3
+    scheduler = initialize_scheduler(
+        block_size=block_size,
+        num_cpu_blocks=16,
+        num_gpu_blocks=16,
+        max_num_seqs=max_seq_group,
+        max_model_len=30,
+        enable_prefix_caching=True,
+    )
+
+    seq_length = 31
+    seq_tokens_prompt_limit: list[list[int]] = []
+    seq_tokens_prompt_limit.append(list(range(seq_length)))
+    seq_and_seq_groups = [
+        create_dummy_prompt("0",
+                            prompt_tokens=seq_tokens_prompt_limit[0],
+                            block_size=block_size)
+    ]
+    for _, seq_group in seq_and_seq_groups:
+        scheduler.add_seq_group(seq_group)
+    scheduler._schedule_default()
+    seq_id_to_num_tokens_computed = (
+        scheduler.block_manager._computed_blocks_tracker.
+        _seq_id_to_num_tokens_computed.get(0))
+    assert seq_id_to_num_tokens_computed is None
+
+    # Budget cannot allocate; AllocStatus is NEVER, so marked FINISHED_IGNORED
+    block_size = 2
+    max_seq_group = 3
+    scheduler = initialize_scheduler(
+        block_size=block_size,
+        num_cpu_blocks=160,
+        num_gpu_blocks=160,
+        max_num_seqs=max_seq_group,
+        max_model_len=320,
+        enable_prefix_caching=True,
+    )
+
+    seq_length = 320
+    num_seqs = 1
+    seq_tokens_never: list[list[int]] = []
+    for i in range(num_seqs):
+        seq_tokens_never.append(list(range(seq_length)))
+
+    seq_and_seq_groups = [
+        create_dummy_prompt(f"{i}",
+                            prompt_tokens=seq_tokens_never[i],
+                            block_size=block_size)
+        for i in range(len(seq_tokens_never))
+    ]
+
+    for _, seq_group in seq_and_seq_groups:
+        scheduler.add_seq_group(seq_group)
+
+    scheduler._schedule_default()
+    seq_id_to_num_tokens_computed = (
+        scheduler.block_manager._computed_blocks_tracker.
+        _seq_id_to_num_tokens_computed.get(0))
+    assert seq_id_to_num_tokens_computed is None
+
+    # Budget cannot allocate; AllocStatus is LATER
+    block_size = 2
+    max_seq_group = 3
+    scheduler = initialize_scheduler(
+        block_size=block_size,
+        num_cpu_blocks=160,
+        num_gpu_blocks=160,
+        max_num_seqs=max_seq_group,
+        max_model_len=320,
+        enable_prefix_caching=True,
+    )
+
+    seq_length = 160
+    num_seqs = 2
+    seq_tokens_later: list[list[int]] = []
+    for i in range(num_seqs):
+        seq_tokens_later.append(list(range(seq_length)))
+
+    seq_and_seq_groups = [
+        create_dummy_prompt(f"{i}",
+                            prompt_tokens=seq_tokens_later[i],
+                            block_size=block_size)
+        for i in range(len(seq_tokens_later))
+    ]
+
+    for _, seq_group in seq_and_seq_groups:
+        scheduler.add_seq_group(seq_group)
+
+    scheduler._schedule_default()
+    seq_id_to_num_tokens_computed = (
+        scheduler.block_manager._computed_blocks_tracker.
+        _seq_id_to_num_tokens_computed.get(1))
+    assert seq_id_to_num_tokens_computed is None
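Note: each branch above ends with the same four-line lookup-and-assert. If more branches are added, a small helper would keep the test compact; a possible refactor (hypothetical helper, not part of this commit):

def assert_seq_removed_from_tracker(scheduler, seq_id: int) -> None:
    # Assert no stale tracker entry survives for seq_id after a pass.
    tracker = scheduler.block_manager._computed_blocks_tracker
    assert tracker._seq_id_to_num_tokens_computed.get(seq_id) is None

Usage would then be a one-liner after each _schedule_* call, e.g. assert_seq_removed_from_tracker(scheduler, 1).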

vllm/core/block_manager.py

Lines changed: 4 additions & 0 deletions
@@ -270,6 +270,10 @@ def free(self, seq: Sequence) -> None:
         self.block_tables[seq_id].free()
         del self.block_tables[seq_id]
 
+    def remove_seq_from_computed_blocks_tracker(self, seq: Sequence) -> None:
+        seq_id = seq.seq_id
+        self._computed_blocks_tracker.remove_seq(seq_id)
+
     def free_cross(self, seq_group: SequenceGroup) -> None:
         request_id = seq_group.request_id
         if request_id not in self.cross_block_tables:
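The scheduler-side changes (the third changed file, not shown in this excerpt) are what invoke this new helper. Judging by the test, every branch that gives up on a sequence group clears the tracker state for each of its sequences, roughly in this shape (an assumed sketch, not the committed scheduler code; only remove_seq_from_computed_blocks_tracker and SequenceGroup.get_seqs are names from vLLM itself):

# Sketch of the assumed caller pattern exercised by the test above.
def drop_group_from_tracker(scheduler, seq_group) -> None:
    # Clear memoized computed-token counts for every sequence in the group,
    # so the next scheduling pass recomputes them against the live cache.
    for seq in seq_group.get_seqs():
        scheduler.block_manager.remove_seq_from_computed_blocks_tracker(seq)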

0 commit comments