9
9
from vllm .config import (CacheConfig , KVTransferConfig , ModelConfig ,
10
10
SchedulerConfig , SpeculativeConfig , VllmConfig )
11
11
from vllm .multimodal .inputs import MultiModalKwargs , PlaceholderRange
12
- from vllm .sampling_params import SamplingParams
12
+ from vllm .sampling_params import GuidedDecodingParams , SamplingParams
13
13
from vllm .v1 .core .sched .output import CachedRequestData , SchedulerOutput
14
14
from vllm .v1 .core .sched .scheduler import Scheduler
15
15
from vllm .v1 .kv_cache_interface import (FullAttentionSpec , KVCacheConfig ,
16
16
KVCacheGroupSpec )
17
17
from vllm .v1 .outputs import ModelRunnerOutput
18
18
from vllm .v1 .request import Request , RequestStatus
19
19
from vllm .v1 .structured_output import StructuredOutputManager
20
+ from vllm .v1 .structured_output .request import StructuredOutputRequest
20
21
21
22
EOS_TOKEN_ID = 50256
22
23
@@ -33,6 +34,7 @@ def create_scheduler(
33
34
block_size : int = 16 ,
34
35
max_model_len : Optional [int ] = None ,
35
36
num_speculative_tokens : Optional [int ] = None ,
37
+ skip_tokenizer_init : bool = False ,
36
38
) -> Scheduler :
37
39
'''Create scheduler under test.
38
40
@@ -65,6 +67,7 @@ def create_scheduler(
65
67
trust_remote_code = True ,
66
68
dtype = "float16" ,
67
69
seed = 42 ,
70
+ skip_tokenizer_init = skip_tokenizer_init ,
68
71
)
69
72
# Cache config, optionally force APC
70
73
kwargs_cache = ({} if enable_prefix_caching is None else {
@@ -186,7 +189,7 @@ def test_get_num_unfinished_requests():
186
189
])
187
190
def test_schedule (enable_prefix_caching : Optional [bool ],
188
191
prompt_logprobs : Optional [int ]):
189
- '''Test scheduling.
192
+ '''Test scheduling.
190
193
Two cases: default APC/no prompt logprobs; APC=True + prompt logprobs
191
194
'''
192
195
scheduler = create_scheduler (enable_prefix_caching = enable_prefix_caching )
@@ -1408,7 +1411,7 @@ def create_requests_with_priority(
1408
1411
1409
1412
1410
1413
def test_priority_scheduling_basic_ordering ():
1411
- """Test that requests are scheduled in priority order
1414
+ """Test that requests are scheduled in priority order
1412
1415
(lower value = higher priority)."""
1413
1416
scheduler = create_scheduler_with_priority ()
1414
1417
@@ -1437,7 +1440,7 @@ def test_priority_scheduling_basic_ordering():
1437
1440
1438
1441
1439
1442
def test_priority_scheduling_arrival_time_tiebreaker ():
1440
- """Test that arrival time is used
1443
+ """Test that arrival time is used
1441
1444
as tiebreaker when priorities are equal."""
1442
1445
scheduler = create_scheduler_with_priority ()
1443
1446
@@ -1495,7 +1498,7 @@ def test_priority_scheduling_mixed_priority_and_arrival():
1495
1498
1496
1499
1497
1500
def test_priority_scheduling_preemption ():
1498
- """Test that priority scheduling preempts
1501
+ """Test that priority scheduling preempts
1499
1502
lower priority requests when memory is constrained."""
1500
1503
# Create scheduler with very limited memory to force preemption
1501
1504
scheduler = create_scheduler_with_priority (
@@ -1576,7 +1579,7 @@ def test_priority_scheduling_preemption():
1576
1579
1577
1580
1578
1581
def test_priority_scheduling_no_preemption_when_space_available ():
1579
- """Test that preemption doesn't happen
1582
+ """Test that preemption doesn't happen
1580
1583
when there's space for new requests."""
1581
1584
scheduler = create_scheduler_with_priority (
1582
1585
max_num_seqs = 3 , # Allow 3 concurrent requests
@@ -1626,7 +1629,7 @@ def test_priority_scheduling_no_preemption_when_space_available():
1626
1629
1627
1630
1628
1631
def test_priority_scheduling_preemption_victim_selection ():
1629
- """Test that the correct victim is selected for
1632
+ """Test that the correct victim is selected for
1630
1633
preemption based on priority and arrival time."""
1631
1634
# This test verifies the priority-based victim selection logic
1632
1635
# by checking the waiting queue order after adding requests with different
@@ -1743,7 +1746,7 @@ def test_priority_scheduling_waiting_queue_order():
1743
1746
1744
1747
1745
1748
def test_priority_scheduling_fcfs_fallback ():
1746
- """Test that FCFS behavior is maintained when all
1749
+ """Test that FCFS behavior is maintained when all
1747
1750
requests have same priority."""
1748
1751
scheduler = create_scheduler_with_priority ()
1749
1752
@@ -1811,7 +1814,7 @@ def test_priority_scheduling_with_limited_slots():
1811
1814
1812
1815
1813
1816
def test_priority_scheduling_heap_property ():
1814
- """Test that the waiting queue maintains heap
1817
+ """Test that the waiting queue maintains heap
1815
1818
property for priority scheduling."""
1816
1819
scheduler = create_scheduler_with_priority (
1817
1820
max_num_seqs = 1 , # Only one request can run at a time
@@ -1857,3 +1860,39 @@ def test_priority_scheduling_heap_property():
1857
1860
# Verify requests were scheduled in priority order (lowest value first)
1858
1861
expected_priorities = sorted (priorities )
1859
1862
assert scheduled_priorities == expected_priorities
1863
+
1864
+
1865
def test_schedule_skip_tokenizer_init():
    """Schedule plain requests on a scheduler built with skip_tokenizer_init.

    Adds five requests from ``create_requests`` and checks that a single
    ``schedule()`` call reports all of them as newly scheduled and that
    the output has no grammar bitmask.
    """
    scheduler = create_scheduler(skip_tokenizer_init=True)
    new_requests = create_requests(num_requests=5)
    for req in new_requests:
        scheduler.add_request(req)

    scheduler_output = scheduler.schedule()

    # Every request should be picked up in the first scheduling step.
    assert len(scheduler_output.scheduled_new_reqs) == len(new_requests)
    assert scheduler_output.grammar_bitmask is None
1873
+
1874
+
1875
def test_schedule_skip_tokenizer_init_structured_output_request():
    """Check a structured-output request is not scheduled without a tokenizer.

    Builds a scheduler with ``skip_tokenizer_init=True``, adds one request
    that carries regex guided-decoding params, and asserts that after one
    ``schedule()`` call it is neither scheduled nor running and is still
    in the waiting queue.
    """
    scheduler = create_scheduler(skip_tokenizer_init=True)

    # The same SamplingParams object backs both the request and its
    # StructuredOutputRequest.
    params = SamplingParams(
        ignore_eos=False,
        max_tokens=16,
        guided_decoding=GuidedDecodingParams(regex="[0-9]+"),
    )
    structured_request = Request(
        request_id="0",
        prompt_token_ids=[0, 1],
        multi_modal_inputs=None,
        multi_modal_hashes=None,
        multi_modal_placeholders=None,
        sampling_params=params,
        pooling_params=None,
        eos_token_id=EOS_TOKEN_ID,
        structured_output_request=StructuredOutputRequest(params),
    )
    scheduler.add_request(structured_request)

    scheduler_output = scheduler.schedule()

    # NOTE(review): presumably the grammar can never become ready when
    # tokenizer init is skipped, so the request stays queued - confirm
    # against the scheduler implementation.
    assert len(scheduler_output.scheduled_new_reqs) == 0
    assert len(scheduler.running) == 0
    assert len(scheduler.waiting) == 1
0 commit comments