Skip to content

Commit e7bcbba

Browse files
authored
Merge vl execution path into normal execution path (#2829)
* merge vl model into gpu_model_runner Change-Id: I9f4691a3d5f135e8d72b1d58abcd15ef3aa3f2a6 * fix Chinese text Change-Id: Ic7405109b984c21e076fb3b01ff6feb571d0119a * fix the parameter parsing Change-Id: I4cd62ee87c06220af580d91e347145d4394917fe * fix the bug in online_inference Change-Id: Idb111bb2114e83017c4050b2a68cf039c6d3c559 * polish code Change-Id: I7d4194102c2f1b0743b74fbd5fc284eb8ef4d17c
1 parent 5fc659b commit e7bcbba

File tree

9 files changed

+442
-1733
lines changed

9 files changed

+442
-1733
lines changed

fastdeploy/config.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818

1919
from dataclasses import dataclass, field
2020
from enum import Enum
21-
from typing import Literal, Optional, Union
21+
from typing import Literal, Optional
2222

2323
from paddleformers.transformers.configuration_utils import PretrainedConfig
2424
from paddleformers.trl import llm_utils
@@ -89,6 +89,7 @@ def __init__(
8989
self.max_model_len = 0
9090
self.dtype = ""
9191
self.enable_logprob = False
92+
self.enable_mm = False
9293

9394
for key, value in args.items():
9495
if hasattr(self, key):

fastdeploy/engine/engine.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -990,8 +990,6 @@ def _start_worker_service(self):
990990
pd_cmd = pd_cmd + f" --log_dir {log_dir}"
991991

992992
worker_path = "../worker/worker_process.py"
993-
if self.cfg.enable_mm:
994-
worker_path = "../worker/vl_worker_process.py"
995993
py_script = os.path.join(current_dir_path, worker_path)
996994

997995
ori_vocab_size = (
@@ -1030,7 +1028,9 @@ def _start_worker_service(self):
10301028
f" --speculative_benchmark_mode {self.cfg.speculative_config.benchmark_mode}"
10311029
f" --max_capture_batch_size {self.cfg.max_capture_batch_size}"
10321030
f" --guided_decoding_backend {self.cfg.guided_decoding_backend}"
1033-
f" --load_strategy {self.cfg.model_config.load_strategy}")
1031+
f" --load_strategy {self.cfg.model_config.load_strategy}"
1032+
f" --enable_mm {self.cfg.enable_mm}")
1033+
10341034

10351035
worker_append_flag = {
10361036
"enable_expert_parallel":

fastdeploy/model_executor/pre_and_post_process.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,36 @@ def post_process_normal(sampler_output: SamplerOutput,
129129
save_each_rank: bool = False,
130130
skip_save_output: bool = False) -> ModelRunnerOutput:
131131
""" Post-processing steps after completing a single token generation. """
132+
# handle vl:
133+
if model_output.enable_thinking:
134+
exists_think_end = sampler_output.sampled_token_ids == model_output.think_end_id
135+
paddle.assign(
136+
paddle.where(
137+
exists_think_end,
138+
model_output.need_think_end - 1,
139+
model_output.need_think_end,
140+
), model_output.need_think_end)
141+
142+
paddle.assign(
143+
paddle.where(
144+
model_output.need_think_end.cast("bool"),
145+
model_output.reasoning_index - 1,
146+
model_output.reasoning_index,
147+
), model_output.reasoning_index)
148+
149+
stop_wo_think = (
150+
(sampler_output.sampled_token_ids == model_output.eos_token_id) |
151+
(model_output.reasoning_index == 0)) & (
152+
model_output.need_think_end > 0)
153+
sampler_output.sampled_token_ids = paddle.where(stop_wo_think,
154+
model_output.think_end_id,
155+
sampler_output.sampled_token_ids)
156+
paddle.assign(
157+
paddle.where(
158+
stop_wo_think,
159+
model_output.need_think_end - 1,
160+
model_output.need_think_end,
161+
), model_output.need_think_end)
132162
# 1. Set stop value
133163
paddle.assign(
134164
paddle.where(

fastdeploy/worker/gpu_model_runner.py

Lines changed: 380 additions & 70 deletions
Large diffs are not rendered by default.

fastdeploy/worker/output.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,27 @@ class ModelOutputData:
201201
"""
202202
accept_num: paddle.Tensor
203203

204+
"""
205+
vl model enable to think
206+
"""
207+
enable_thinking: paddle.Tensor = None
208+
209+
"""
210+
vl model think end id
211+
"""
212+
think_end_id: int = -1
213+
214+
"""
215+
vl model need to think
216+
"""
217+
need_think_end: paddle.Tensor = None
218+
219+
"""
220+
vl model reasoning index
221+
"""
222+
reasoning_index: paddle.Tensor = None
223+
224+
204225

205226
@dataclass
206227
class ModelRunnerOutput:

0 commit comments

Comments (0)