Skip to content

Commit e7bcbba

Browse files
authored
Merge vl execution path into normal execution path (#2829)
* merge vl model into gpu_model_runner Change-Id: I9f4691a3d5f135e8d72b1d58abcd15ef3aa3f2a6 * fix Chinese text Change-Id: Ic7405109b984c21e076fb3b01ff6feb571d0119a * fix the parameter parsing Change-Id: I4cd62ee87c06220af580d91e347145d4394917fe * fix the bug in online_inference Change-Id: Idb111bb2114e83017c4050b2a68cf039c6d3c559 * polish code Change-Id: I7d4194102c2f1b0743b74fbd5fc284eb8ef4d17c
1 parent 5fc659b commit e7bcbba

File tree

9 files changed

+442
-1733
lines changed

9 files changed

+442
-1733
lines changed

fastdeploy/config.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818

1919
from dataclasses import dataclass, field
2020
from enum import Enum
21-
from typing import Literal, Optional, Union
21+
from typing import Literal, Optional
2222

2323
from paddleformers.transformers.configuration_utils import PretrainedConfig
2424
from paddleformers.trl import llm_utils
@@ -89,6 +89,7 @@ def __init__(
8989
self.max_model_len = 0
9090
self.dtype = ""
9191
self.enable_logprob = False
92+
self.enable_mm = False
9293

9394
for key, value in args.items():
9495
if hasattr(self, key):

fastdeploy/engine/engine.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -990,8 +990,6 @@ def _start_worker_service(self):
990990
pd_cmd = pd_cmd + f" --log_dir {log_dir}"
991991

992992
worker_path = "../worker/worker_process.py"
993-
if self.cfg.enable_mm:
994-
worker_path = "../worker/vl_worker_process.py"
995993
py_script = os.path.join(current_dir_path, worker_path)
996994

997995
ori_vocab_size = (
@@ -1030,7 +1028,9 @@ def _start_worker_service(self):
10301028
f" --speculative_benchmark_mode {self.cfg.speculative_config.benchmark_mode}"
10311029
f" --max_capture_batch_size {self.cfg.max_capture_batch_size}"
10321030
f" --guided_decoding_backend {self.cfg.guided_decoding_backend}"
1033-
f" --load_strategy {self.cfg.model_config.load_strategy}")
1031+
f" --load_strategy {self.cfg.model_config.load_strategy}"
1032+
f" --enable_mm {self.cfg.enable_mm}")
1033+
10341034

10351035
worker_append_flag = {
10361036
"enable_expert_parallel":

fastdeploy/model_executor/pre_and_post_process.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,36 @@ def post_process_normal(sampler_output: SamplerOutput,
129129
save_each_rank: bool = False,
130130
skip_save_output: bool = False) -> ModelRunnerOutput:
131131
""" Post-processing steps after completing a single token generation. """
132+
# handle vl:
133+
if model_output.enable_thinking:
134+
exists_think_end = sampler_output.sampled_token_ids == model_output.think_end_id
135+
paddle.assign(
136+
paddle.where(
137+
exists_think_end,
138+
model_output.need_think_end - 1,
139+
model_output.need_think_end,
140+
), model_output.need_think_end)
141+
142+
paddle.assign(
143+
paddle.where(
144+
model_output.need_think_end.cast("bool"),
145+
model_output.reasoning_index - 1,
146+
model_output.reasoning_index,
147+
), model_output.reasoning_index)
148+
149+
stop_wo_think = (
150+
(sampler_output.sampled_token_ids == model_output.eos_token_id) |
151+
(model_output.reasoning_index == 0)) & (
152+
model_output.need_think_end > 0)
153+
sampler_output.sampled_token_ids = paddle.where(stop_wo_think,
154+
model_output.think_end_id,
155+
sampler_output.sampled_token_ids)
156+
paddle.assign(
157+
paddle.where(
158+
stop_wo_think,
159+
model_output.need_think_end - 1,
160+
model_output.need_think_end,
161+
), model_output.need_think_end)
132162
# 1. Set stop value
133163
paddle.assign(
134164
paddle.where(

fastdeploy/worker/gpu_model_runner.py

Lines changed: 380 additions & 70 deletions
Large diffs are not rendered by default.

fastdeploy/worker/output.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,27 @@ class ModelOutputData:
201201
"""
202202
accept_num: paddle.Tensor
203203

204+
"""
205+
vl model enable to think
206+
"""
207+
enable_thinking: paddle.Tensor = None
208+
209+
"""
210+
vl model think end id
211+
"""
212+
think_end_id: int = -1
213+
214+
"""
215+
vl model need to think
216+
"""
217+
need_think_end: paddle.Tensor = None
218+
219+
"""
220+
vl model reasoning index
221+
"""
222+
reasoning_index: paddle.Tensor = None
223+
224+
204225

205226
@dataclass
206227
class ModelRunnerOutput:

0 commit comments

Comments (0)