@@ -142,7 +142,8 @@ def xpu_process_output(
 
 
 def xpu_post_process(sampled_token_ids: paddle.Tensor,
-                     model_output: ModelOutputData) -> None:
+                     model_output: ModelOutputData,
+                     skip_save_output: bool) -> None:
     """
 
     """
@@ -185,12 +186,13 @@ def xpu_post_process(sampled_token_ids: paddle.Tensor,
     )
     # 3. Transmit the model's output and stop generation signal via message queue.
     # In the future, we will abandon this approach.
-    save_output(
-        sampled_token_ids,
-        model_output.not_need_stop,
-        model_output.mp_rank,
-        False,  # use_ep
-    )
+    if not skip_save_output:
+        save_output(
+            sampled_token_ids,
+            model_output.not_need_stop,
+            model_output.mp_rank,
+            False,  # use_ep
+        )
 
 
 def step_paddle(share_inputs: Dict[str, paddle.Tensor], block_size: int,
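With this guard, a warm-up invocation still runs the full post-processing path but never pushes a result onto the inter-process message queue. The self-contained toy below shows the effect; the list-based `queue` and `fake_save_output` are stand-ins for FastDeploy's real transport, not its API.

```python
# Toy illustration of the guard; the queue and fake_save_output are stand-ins.
from typing import List

queue: List[dict] = []          # pretend inter-process message queue


def fake_save_output(tokens: List[int], not_need_stop: bool, rank: int) -> None:
    queue.append({"tokens": tokens, "not_need_stop": not_need_stop, "rank": rank})


def post_process(tokens: List[int], skip_save_output: bool) -> None:
    # ... sampling bookkeeping would happen here ...
    if not skip_save_output:            # same guard as in the hunk above
        fake_save_output(tokens, True, 0)


post_process([7, 8, 9], skip_save_output=True)    # warm-up: nothing transmitted
assert queue == []
post_process([7, 8, 9], skip_save_output=False)   # real step: one message queued
assert len(queue) == 1
```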
@@ -658,14 +660,15 @@ def _dummy_run(self,
         self._dummy_prefill_inputs(num_tokens, batch_size)
 
         while True:
-            self.execute_model(None)
+            self.execute_model(None, True)
 
             if int((self.share_inputs['seq_lens_this_time'] > 0).sum()) == 0:
                 break
 
     def execute_model(
         self,
         model_forward_batch: Optional[List[Request]] = None,
+        is_dummy_run: bool = False,
     ) -> Optional[ModelRunnerOutput]:
         """
         The Entrance of model execute.
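`_dummy_run` passes `True` positionally as the new `is_dummy_run` argument; because the parameter defaults to `False`, production call sites that pass only a batch keep working, and the flag is then forwarded as `skip_save_output=is_dummy_run` in the final hunk below. A rough standalone sketch of the same pattern follows; `WarmupRunner` and its fields are simplified stand-ins for the XPU model runner, not the real class.

```python
# Standalone sketch of the warm-up pattern with a defaulted is_dummy_run flag.
from typing import List, Optional


class WarmupRunner:
    def __init__(self) -> None:
        self.seq_lens_this_time: List[int] = [4, 2, 1]   # pretend per-request lengths
        self.outputs_sent = 0

    def execute_model(self,
                      model_forward_batch: Optional[list] = None,
                      is_dummy_run: bool = False) -> None:
        # Decode one token per still-active request.
        self.seq_lens_this_time = [max(n - 1, 0) for n in self.seq_lens_this_time]
        if not is_dummy_run:                 # mirrors skip_save_output=is_dummy_run
            self.outputs_sent += 1

    def dummy_run(self) -> None:
        while True:
            self.execute_model(None, True)   # warm-up: flag set, nothing transmitted
            if sum(n > 0 for n in self.seq_lens_this_time) == 0:
                break


runner = WarmupRunner()
runner.dummy_run()
assert runner.outputs_sent == 0              # warm-up produced no visible output
runner.execute_model([object()])             # normal call site unchanged
assert runner.outputs_sent == 1
```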
@@ -721,7 +724,8 @@ class at the server level, which is too granular for ModelRunner.
721
724
accept_num = None ,
722
725
)
723
726
xpu_post_process (sampled_token_ids = sampled_token_ids ,
724
- model_output = model_output_data )
727
+ model_output = model_output_data ,
728
+ skip_save_output = is_dummy_run )
725
729
726
730
# 7. Updata 'infer_seed' and step_paddle()
727
731
self .share_inputs ["infer_seed" ].add_ (self .infer_seed_increment )