@@ -142,7 +142,8 @@ def xpu_process_output(
 
 
 def xpu_post_process(sampled_token_ids: paddle.Tensor,
-                     model_output: ModelOutputData) -> None:
+                     model_output: ModelOutputData,
+                     skip_save_output: bool) -> None:
     """
 
     """
@@ -185,12 +186,13 @@ def xpu_post_process(sampled_token_ids: paddle.Tensor,
     )
     # 3. Transmit the model's output and stop generation signal via message queue.
     # In the future, we will abandon this approach.
-    save_output(
-        sampled_token_ids,
-        model_output.not_need_stop,
-        model_output.mp_rank,
-        False,  # use_ep
-    )
+    if not skip_save_output:
+        save_output(
+            sampled_token_ids,
+            model_output.not_need_stop,
+            model_output.mp_rank,
+            False,  # use_ep
+        )
 
 
 def step_paddle(share_inputs: Dict[str, paddle.Tensor], block_size: int,
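With this guard, a warm-up invocation still runs the full post-processing path but never pushes a result onto the inter-process message queue. The self-contained toy below shows the effect; the list-based `queue` and `fake_save_output` are stand-ins for FastDeploy's real transport, not its API.

```python
# Toy illustration of the guard; the queue and fake_save_output are stand-ins.
from typing import List

queue: List[dict] = []          # pretend inter-process message queue


def fake_save_output(tokens: List[int], not_need_stop: bool, rank: int) -> None:
    queue.append({"tokens": tokens, "not_need_stop": not_need_stop, "rank": rank})


def post_process(tokens: List[int], skip_save_output: bool) -> None:
    # ... sampling bookkeeping would happen here ...
    if not skip_save_output:            # same guard as in the hunk above
        fake_save_output(tokens, True, 0)


post_process([7, 8, 9], skip_save_output=True)    # warm-up: nothing transmitted
assert queue == []
post_process([7, 8, 9], skip_save_output=False)   # real step: one message queued
assert len(queue) == 1
```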
@@ -658,14 +660,15 @@ def _dummy_run(self,
         self._dummy_prefill_inputs(num_tokens, batch_size)
 
         while True:
-            self.execute_model(None)
+            self.execute_model(None, True)
 
             if int((self.share_inputs['seq_lens_this_time'] > 0).sum()) == 0:
                 break
 
     def execute_model(
         self,
         model_forward_batch: Optional[List[Request]] = None,
+        is_dummy_run: bool = False,
     ) -> Optional[ModelRunnerOutput]:
         """
         The Entrance of model execute.
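`_dummy_run` passes `True` positionally as the new `is_dummy_run` argument; because the parameter defaults to `False`, production call sites that pass only a batch keep working, and the flag is then forwarded as `skip_save_output=is_dummy_run` in the final hunk below. A rough standalone sketch of the same pattern follows; `WarmupRunner` and its fields are simplified stand-ins for the XPU model runner, not the real class.

```python
# Standalone sketch of the warm-up pattern with a defaulted is_dummy_run flag.
from typing import List, Optional


class WarmupRunner:
    def __init__(self) -> None:
        self.seq_lens_this_time: List[int] = [4, 2, 1]   # pretend per-request lengths
        self.outputs_sent = 0

    def execute_model(self,
                      model_forward_batch: Optional[list] = None,
                      is_dummy_run: bool = False) -> None:
        # Decode one token per still-active request.
        self.seq_lens_this_time = [max(n - 1, 0) for n in self.seq_lens_this_time]
        if not is_dummy_run:                 # mirrors skip_save_output=is_dummy_run
            self.outputs_sent += 1

    def dummy_run(self) -> None:
        while True:
            self.execute_model(None, True)   # warm-up: flag set, nothing transmitted
            if sum(n > 0 for n in self.seq_lens_this_time) == 0:
                break


runner = WarmupRunner()
runner.dummy_run()
assert runner.outputs_sent == 0              # warm-up produced no visible output
runner.execute_model([object()])             # normal call site unchanged
assert runner.outputs_sent == 1
```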
@@ -721,7 +724,8 @@ class at the server level, which is too granular for ModelRunner.
721
724
accept_num = None ,
722
725
)
723
726
xpu_post_process (sampled_token_ids = sampled_token_ids ,
724
- model_output = model_output_data )
727
+ model_output = model_output_data ,
728
+ skip_save_output = is_dummy_run )
725
729
726
730
# 7. Updata 'infer_seed' and step_paddle()
727
731
self .share_inputs ["infer_seed" ].add_ (self .infer_seed_increment )