31
31
from vllm .v1 .structured_output import StructuredOutputManager
32
32
33
33
from vllm_ascend .core .scheduler import AscendScheduler
34
+ from vllm_ascend .utils import vllm_version_is
34
35
35
36
EOS_TOKEN_ID = 50256
36
37
@@ -83,11 +84,10 @@ def create_scheduler(
83
84
cache_dtype = "auto" ,
84
85
** kwargs_cache ,
85
86
)
86
- vllm_config = VllmConfig (
87
- scheduler_config = scheduler_config ,
88
- model_config = model_config ,
89
- cache_config = cache_config ,
90
- )
87
+ vllm_config = VllmConfig (scheduler_config = scheduler_config ,
88
+ model_config = model_config ,
89
+ cache_config = cache_config )
90
+
91
91
kv_cache_config = KVCacheConfig (
92
92
num_blocks = 10000 , # A large number of blocks to hold all requests
93
93
tensors = {},
@@ -98,10 +98,7 @@ def create_scheduler(
98
98
)
99
99
cache_config .num_gpu_blocks = 10000
100
100
return AscendScheduler (
101
- scheduler_config ,
102
- model_config ,
103
- cache_config ,
104
- lora_config = None ,
101
+ vllm_config ,
105
102
kv_cache_config = kv_cache_config ,
106
103
log_stats = True ,
107
104
structured_output_manager = StructuredOutputManager (vllm_config ),
@@ -126,17 +123,27 @@ def create_requests(num_requests: int,
126
123
else :
127
124
mm_position = None
128
125
mm_inputs = None
129
- request = Request (
130
- request_id = f"{ i } " ,
131
- prompt = None ,
132
- prompt_token_ids = [i ] * num_tokens ,
133
- sampling_params = sampling_params ,
134
- multi_modal_inputs = mm_inputs ,
135
- multi_modal_placeholders = mm_position ,
136
- multi_modal_hashes = None ,
137
- eos_token_id = EOS_TOKEN_ID ,
138
- arrival_time = 0 ,
139
- )
126
+ if vllm_version_is ("0.9.0" ):
127
+ request = Request (
128
+ request_id = f"{ i } " ,
129
+ prompt_token_ids = [i ] * num_tokens ,
130
+ sampling_params = sampling_params ,
131
+ multi_modal_inputs = mm_inputs ,
132
+ multi_modal_placeholders = mm_position ,
133
+ multi_modal_hashes = None ,
134
+ arrival_time = 0 ,
135
+ eos_token_id = EOS_TOKEN_ID ,
136
+ )
137
+ else :
138
+ request = Request (
139
+ request_id = f"{ i } " ,
140
+ prompt_token_ids = [i ] * num_tokens ,
141
+ sampling_params = sampling_params ,
142
+ multi_modal_inputs = mm_inputs ,
143
+ multi_modal_placeholders = mm_position ,
144
+ multi_modal_hashes = None ,
145
+ eos_token_id = EOS_TOKEN_ID ,
146
+ )
140
147
requests .append (request )
141
148
return requests
142
149
@@ -225,12 +232,9 @@ def test_stop_via_update_from_output():
225
232
requests [0 ].request_id : 1 ,
226
233
requests [1 ].request_id : 2
227
234
},
235
+ scheduled_spec_decode_tokens = {},
228
236
total_num_scheduled_tokens = 3 ,
229
237
scheduled_encoder_inputs = {},
230
- scheduled_spec_decode_tokens = {
231
- requests [0 ].request_id : [],
232
- requests [1 ].request_id : [10 ]
233
- },
234
238
num_common_prefix_blocks = 0 ,
235
239
finished_req_ids = set (),
236
240
free_encoder_input_ids = [],
@@ -275,12 +279,9 @@ def test_stop_via_update_from_output():
275
279
requests [0 ].request_id : 3 ,
276
280
requests [1 ].request_id : 2
277
281
},
282
+ scheduled_spec_decode_tokens = {},
278
283
total_num_scheduled_tokens = 5 ,
279
284
scheduled_encoder_inputs = {},
280
- scheduled_spec_decode_tokens = {
281
- requests [0 ].request_id : [10 , 42 ],
282
- requests [1 ].request_id : [13 ]
283
- },
284
285
num_common_prefix_blocks = 0 ,
285
286
finished_req_ids = set (),
286
287
free_encoder_input_ids = [],
@@ -323,12 +324,9 @@ def test_stop_via_update_from_output():
323
324
requests [0 ].request_id : 3 ,
324
325
requests [1 ].request_id : 1
325
326
},
327
+ scheduled_spec_decode_tokens = {},
326
328
total_num_scheduled_tokens = 4 ,
327
329
scheduled_encoder_inputs = {},
328
- scheduled_spec_decode_tokens = {
329
- requests [0 ].request_id : [10 , 11 ],
330
- requests [1 ].request_id : []
331
- },
332
330
num_common_prefix_blocks = 0 ,
333
331
finished_req_ids = set (),
334
332
free_encoder_input_ids = [],
@@ -369,11 +367,9 @@ def test_stop_via_update_from_output():
369
367
scheduled_new_reqs = [],
370
368
scheduled_cached_reqs = [],
371
369
num_scheduled_tokens = {requests [0 ].request_id : 3 },
370
+ scheduled_spec_decode_tokens = {},
372
371
total_num_scheduled_tokens = 3 ,
373
372
scheduled_encoder_inputs = {},
374
- scheduled_spec_decode_tokens = {
375
- requests [0 ].request_id : [EOS_TOKEN_ID , 10 ]
376
- },
377
373
num_common_prefix_blocks = 0 ,
378
374
finished_req_ids = set (),
379
375
free_encoder_input_ids = [],
0 commit comments