2 files changed, +23 -1 lines

First changed file (a test module for LLM.generate):
@@ -25,6 +25,12 @@
 ]


+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines):
+    """We can run both engines for this test."""
+    pass
+
+
 @pytest.fixture(scope="module")
 def llm():
     # pytest caches the fixture so we use weakref.proxy to
@@ -104,3 +110,19 @@ def test_multiple_sampling_params(llm: LLM):
     # sampling_params is None, default params should be applied
     outputs = llm.generate(PROMPTS, sampling_params=None)
     assert len(PROMPTS) == len(outputs)
+
+
+def test_max_model_len():
+    max_model_len = 20
+    llm = LLM(
+        model=MODEL_NAME,
+        max_model_len=max_model_len,
+        gpu_memory_utilization=0.10,
+        enforce_eager=True,  # reduce test time
+    )
+    sampling_params = SamplingParams(max_tokens=max_model_len + 10)
+    outputs = llm.generate(PROMPTS, sampling_params)
+    for output in outputs:
+        num_total_tokens = len(output.prompt_token_ids) + len(
+            output.outputs[0].token_ids)
+        assert num_total_tokens == max_model_len
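For context on what the new test exercises: vLLM is expected to cap the total sequence length (prompt tokens plus generated tokens) at max_model_len, even when SamplingParams.max_tokens asks for more. Below is a minimal standalone sketch of the same check outside pytest; the model name and prompt are placeholders assumed here, not values from the test module, which defines its own MODEL_NAME and PROMPTS.

from vllm import LLM, SamplingParams

# Placeholder model and prompt (assumed); the test module uses its own
# MODEL_NAME and PROMPTS constants.
model_name = "facebook/opt-125m"
prompt = "Hello, my name is"

max_model_len = 20
llm = LLM(model=model_name, max_model_len=max_model_len, enforce_eager=True)

# Ask for more tokens than the context window can hold.
params = SamplingParams(max_tokens=max_model_len + 10)
output = llm.generate([prompt], params)[0]

total = len(output.prompt_token_ids) + len(output.outputs[0].token_ids)
# With the stop-checker change below, the total is capped at exactly
# max_model_len instead of overshooting by one token.
print(f"total tokens: {total}, cap: {max_model_len}")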
Second changed file (under vllm/engine/output_processor):

@@ -82,7 +82,7 @@ def maybe_stop_sequence(
             return

         # Check if the sequence has reached max_model_len.
-        if seq.get_len() > self._get_max_model_len(lora_req):
+        if seq.get_len() >= self._get_max_model_len(lora_req):
             seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED
             return

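Why the comparison becomes >= rather than >: the length check runs after the newly generated token has been appended to the sequence, so with the old strict comparison a sequence had to grow past max_model_len before it was marked FINISHED_LENGTH_CAPPED, ending up one token over the cap. A toy boundary-case sketch follows (a standalone helper written for illustration, not vLLM's StopChecker API).

def capped(seq_len: int, max_model_len: int, *, inclusive: bool) -> bool:
    # Toy stand-in for the stop check; not the real vLLM StopChecker.
    return seq_len >= max_model_len if inclusive else seq_len > max_model_len

MAX_MODEL_LEN = 20

# Old check (>): a sequence that has just reached the cap is not stopped,
# so another decode step runs and the final length becomes 21.
assert capped(20, MAX_MODEL_LEN, inclusive=False) is False
assert capped(21, MAX_MODEL_LEN, inclusive=False) is True

# New check (>=): the sequence is finished the moment it reaches the cap,
# which is exactly what test_max_model_len asserts.
assert capped(20, MAX_MODEL_LEN, inclusive=True) is True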