
Commit bb88003

support stop_seqs

1 parent 050d965

10 files changed (+80 additions, −26 deletions)

docs/usage/environment_variables.md (6 additions, 2 deletions)

````diff
@@ -32,6 +32,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "FD_STOP_SEQS_MAX_LEN":
     lambda: os.getenv("FD_STOP_SEQS_MAX_LEN", "8"),
 
+    # Whether to use stop sequences (0 or 1)
+    "FD_USE_STOP_SEQ":
+    lambda: os.getenv("FD_USE_STOP_SEQ", "0"),
+
     # GPU devices to use (comma-separated string, e.g. 0,1,2)
     "CUDA_VISIBLE_DEVICES":
     lambda: os.getenv("CUDA_VISIBLE_DEVICES", None),
@@ -67,6 +71,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # Switch from standalone PD to centralized inference (0 or 1)
     "FD_PD_CHANGEABLE":
     lambda: os.getenv("FD_PD_CHANGEABLE", "1"),
-
+
 }
-```
+```
````
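Each entry in `environment_variables` is a zero-argument lambda, so the value is re-read from the process environment whenever the setting is accessed. A minimal usage sketch (the `FD_USE_STOP_SEQ` name and its `"0"` default come from this diff; the rest is illustrative):

```python
import os

# Enable the multi-token stop-sequence path. Setting this before
# FastDeploy first reads the variable is the safe ordering.
os.environ["FD_USE_STOP_SEQ"] = "1"

# Mirrors the lookup the entry above performs: string default "0" -> int flag.
use_stop_seq = int(os.getenv("FD_USE_STOP_SEQ", "0"))
assert use_stop_seq == 1
```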

docs/zh/usage/environment_variables.md (7 additions, 2 deletions)

````diff
@@ -1,5 +1,6 @@
 # FastDeploy Environment Variable Reference
 FastDeploy's environment variables are defined in the fastdeploy/envs.py file at the repository root; their descriptions are listed below:
+
 ```python
 environment_variables: dict[str, Callable[[], Any]] = {
     # CUDA architecture versions used when building FastDeploy; a list of strings, e.g. [80,90]
@@ -30,6 +31,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "FD_STOP_SEQS_MAX_LEN":
     lambda: os.getenv("FD_STOP_SEQS_MAX_LEN", "8"),
 
+    # Whether to use stop sequences
+    "FD_USE_STOP_SEQ":
+    lambda: os.getenv("FD_USE_STOP_SEQ", "0"),
+
     # GPU devices to use; a comma-separated string, e.g. 0,1,2
     "CUDA_VISIBLE_DEVICES":
     lambda: os.getenv("CUDA_VISIBLE_DEVICES", None),
@@ -65,6 +70,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # Whether to switch from standalone PD disaggregation to centralized inference
     "FD_PD_CHANGEABLE":
     lambda: os.getenv("FD_PD_CHANGEABLE", "1"),
-
+
 }
-```
+```
````

fastdeploy/config.py (4 additions, 0 deletions)

```diff
@@ -22,6 +22,7 @@
 
 from paddleformers.transformers.configuration_utils import PretrainedConfig
 
+from fastdeploy import envs
 from fastdeploy.model_executor.layers.quantization.quant_base import \
     QuantConfigBase
 from fastdeploy.utils import get_logger
@@ -124,6 +125,9 @@ def __init__(
         self.tie_word_embeddings = tie_word_embeddings
         self.is_quantized = is_quantized
 
+        self.max_stop_seqs_num = int(envs.FD_MAX_STOP_SEQS_NUM)
+        self.stop_seqs_max_len = int(envs.FD_STOP_SEQS_MAX_LEN)
+        self.use_stop_seq = int(envs.FD_USE_STOP_SEQ)
 
 @dataclass
 class MoEConfig:
```
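`ModelConfig` now snapshots the three stop-sequence knobs at construction time. A hedged sketch of what those values end up controlling; only the `"8"` default for `FD_STOP_SEQS_MAX_LEN` appears in this diff, so the `"5"` default for `FD_MAX_STOP_SEQS_NUM` below is an assumption:

```python
import os

# Assumed defaults: "8" is shown in this diff, "5" is illustrative.
max_stop_seqs_num = int(os.getenv("FD_MAX_STOP_SEQS_NUM", "5"))
stop_seqs_max_len = int(os.getenv("FD_STOP_SEQS_MAX_LEN", "8"))
use_stop_seq = int(os.getenv("FD_USE_STOP_SEQ", "0"))

# These settings size the shared "stop_seqs" buffer the model runner
# allocates later in this commit: [max_stop_seqs_num, stop_seqs_max_len].
buffer_shape = (max_stop_seqs_num, stop_seqs_max_len)
print(buffer_shape, bool(use_stop_seq))
```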

fastdeploy/engine/config.py (1 addition, 0 deletions)

```diff
@@ -126,6 +126,7 @@ def read_from_env(self):
         """
         self.max_stop_seqs_num = int(envs.FD_MAX_STOP_SEQS_NUM)
         self.stop_seqs_max_len = int(envs.FD_STOP_SEQS_MAX_LEN)
+        self.use_stop_seq = int(envs.FD_USE_STOP_SEQ)
 
         def reset_config_value(key, value):
             if not hasattr(self, key.lower()):
```

fastdeploy/engine/sampling_params.py (1 addition, 0 deletions)

```diff
@@ -85,6 +85,7 @@ class SamplingParams:
     seed: Optional[int] = None
     stop: Optional[Union[str, List[str]]] = None
     stop_token_ids: Optional[Union[List[List[int]], List[int]]] = None
+    stop_seqs_len: Optional[int] = None
     max_tokens: Optional[int] = None
     reasoning_max_tokens: Optional[int] = None
     min_tokens: int = 1
```
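Note that although the new field is annotated `Optional[int]`, the runner changes later in this commit append to it and convert it with `np.array`, so in practice it carries one length per tokenized stop sequence. A hedged construction sketch (the import path is taken from this diff; the token ids are made up):

```python
from fastdeploy.engine.sampling_params import SamplingParams  # path from this diff

# Two hypothetical tokenized stop sequences; the ids are illustrative only.
stop_token_ids = [[185, 2724, 28], [100007]]

params = SamplingParams(
    stop=["\nUser:"],                                 # raw stop strings
    stop_token_ids=stop_token_ids,                    # List[List[int]] form
    stop_seqs_len=[len(s) for s in stop_token_ids],   # [3, 1]
    max_tokens=128,
)
```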

fastdeploy/envs.py (4 additions, 0 deletions)

```diff
@@ -52,6 +52,10 @@
     "FD_STOP_SEQS_MAX_LEN":
     lambda: os.getenv("FD_STOP_SEQS_MAX_LEN", "8"),
 
+    # Whether to use stop sequences (0 or 1)
+    "FD_USE_STOP_SEQ":
+    lambda: os.getenv("FD_USE_STOP_SEQ", "0"),
+
     # GPU devices that will be used. This is a string that
     # is split by commas, such as 0,1,2.
     "CUDA_VISIBLE_DEVICES":
```

fastdeploy/model_executor/pre_and_post_process.py (28 additions, 14 deletions)

```diff
@@ -21,11 +21,12 @@
 from fastdeploy.engine.config import SpeculativeConfig
 from fastdeploy.model_executor.ops.gpu import (
     get_padding_offset, save_output, set_stop_value_multi_ends,
-    speculate_clear_accept_nums, speculate_get_output_padding_offset,
-    speculate_get_padding_offset, speculate_get_seq_lens_output,
-    speculate_save_output, speculate_set_value_by_flags_and_idx,
-    speculate_step_paddle, speculate_step_system_cache, speculate_update_v3,
-    step_paddle, step_system_cache, update_inputs, step_reschedule)
+    set_stop_value_multi_seqs, speculate_clear_accept_nums,
+    speculate_get_output_padding_offset, speculate_get_padding_offset,
+    speculate_get_seq_lens_output, speculate_save_output,
+    speculate_set_value_by_flags_and_idx, speculate_step_paddle,
+    speculate_step_system_cache, speculate_update_v3, step_paddle,
+    step_reschedule, step_system_cache, update_inputs)
 from fastdeploy.platforms import current_platform
 from fastdeploy.worker.output import ModelOutputData
 
@@ -105,7 +106,8 @@ def pre_process(
 def post_process_normal(sampled_token_ids: paddle.Tensor,
                         model_output: ModelOutputData,
                         save_each_rank: bool = False,
-                        skip_save_output: bool = False) -> None:
+                        skip_save_output: bool = False,
+                        use_stop_seqs: bool = False) -> None:
     """ Post-processing steps after completing a single token generation. """
     # 1. Set stop value
     paddle.assign(
@@ -122,12 +124,23 @@ def post_process_normal(sampled_token_ids: paddle.Tensor,
         paddle.logical_or(model_output.stop_flags, length_cond),
         model_output.stop_flags,
     )
-    # TODO(gongshaotian): Add use_stop_seqs
-    set_stop_value_multi_ends(sampled_token_ids, model_output.stop_flags,
-                              model_output.seq_lens_this_time,
-                              model_output.eos_token_id,
-                              model_output.next_tokens, False)  # multi ends
 
+    if not use_stop_seqs:
+        set_stop_value_multi_ends(sampled_token_ids, model_output.stop_flags,
+                                  model_output.seq_lens_this_time,
+                                  model_output.eos_token_id,
+                                  model_output.next_tokens, False)  # multi ends
+    else:
+        set_stop_value_multi_seqs(
+            sampled_token_ids,
+            model_output.pre_ids,
+            model_output.step_idx,
+            model_output.stop_flags,
+            model_output.seq_lens_this_time,
+            model_output.stop_token_ids,
+            model_output.stop_seqs_len,
+            model_output.eos_token_id,
+        )
     # 2. Update the input buffer of the model
     with paddle.framework._no_check_dy2st_diff():
         update_inputs(
@@ -197,13 +210,14 @@ def post_process(sampled_token_ids: paddle.Tensor,
                  model_output: ModelOutputData,
                  save_each_rank: bool = False,
                  speculative_decoding: bool = False,
-                 skip_save_output: bool = False) -> None:
+                 skip_save_output: bool = False,
+                 use_stop_seq: bool = False) -> None:
     """ Post-processing steps after completing a single token generation. """
     if speculative_decoding:
         post_process_specualate(model_output, skip_save_output)
     else:
         post_process_normal(sampled_token_ids, model_output, save_each_rank,
-                            skip_save_output)
+                            skip_save_output, use_stop_seq)
 
 
 def step_cuda(
@@ -217,7 +231,7 @@ def step_cuda(
     TODO(gongshaotian): normalization name
     """
 
-
+
     if speculative_config.method is not None:
         if enable_prefix_caching:
             speculate_step_system_cache(
```
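`set_stop_value_multi_seqs` is a fused GPU op whose source is not part of this diff. Judging from its arguments, it checks each sequence's generation history (`pre_ids` plus the newest sampled token) for a suffix match against every configured stop sequence, and on a match raises the stop flag and overwrites the token with EOS. A rough NumPy reference of that presumed behavior, not the actual kernel:

```python
import numpy as np

def stop_on_sequences_ref(sampled_token_ids, pre_ids, step_idx, stop_flags,
                          stop_token_ids, stop_seqs_len, eos_token_id):
    """Presumed semantics of set_stop_value_multi_seqs, in NumPy.

    sampled_token_ids: [batch, 1] newest token per sequence
    pre_ids:           [batch, max_len] previously generated ids
    step_idx:          [batch] number of tokens generated so far
    stop_token_ids:    [num_stop_seqs, stop_seqs_max_len], -1 padded
    stop_seqs_len:     [num_stop_seqs] valid length of each row
    eos_token_id:      scalar written over the matched token (assumption)
    """
    for b in range(sampled_token_ids.shape[0]):
        if stop_flags[b]:
            continue  # sequence already finished
        n = int(step_idx[b])
        history = np.concatenate([pre_ids[b, :n], sampled_token_ids[b]])
        for s in range(stop_token_ids.shape[0]):
            length = int(stop_seqs_len[s])
            if 0 < length <= history.size and \
                    np.array_equal(history[-length:], stop_token_ids[s, :length]):
                stop_flags[b] = True
                sampled_token_ids[b, 0] = eos_token_id
                break
```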

fastdeploy/worker/gpu_model_runner.py (16 additions, 6 deletions)

```diff
@@ -280,9 +280,9 @@ def insert_prefill_inputs(self, req_dicts: List[Request]):
                 stop_seqs_num = len(request.get("stop_seqs_len"))
                 for i in range(stop_seqs_num,
                                self.model_config.max_stop_seqs_num):
-                    request.stop_seqs_len.append(0)
+                    request.sampling_params.stop_seqs_len.append(0)
                 self.share_inputs["stop_seqs_len"][:] = np.array(
-                    request.stop_seqs_len, dtype="int32")
+                    request.sampling_params.stop_seqs_len, dtype="int32")
                 self.share_inputs["stop_seqs"][:stop_seqs_num, :len(
                     request.get("stop_token_ids")[0])] = np.array(
                         request.get("stop_token_ids"), dtype="int64")
@@ -505,7 +505,7 @@ def _init_share_inputs(self, max_num_seqs: int):
                 self.model_config.stop_seqs_max_len
             ],
             -1,
-            dtype="int32")
+            dtype="int64")
         if self.speculative_decoding:
             max_draft_token_num = self.speculative_config.num_speculative_tokens
             self.share_inputs["input_ids_cpu"] = paddle.full(
@@ -832,7 +832,11 @@ def _dummy_run(self,
             accept_tokens=self.share_inputs["accept_tokens"]
             if self.speculative_decoding else None,
             accept_num=self.share_inputs["accept_num"]
-            if self.speculative_decoding else None)
+            if self.speculative_decoding else None,
+            stop_token_ids=self.share_inputs["stop_seqs"]
+            if self.model_config.use_stop_seq else None,
+            stop_seqs_len=self.share_inputs["stop_seqs_len"]
+            if self.model_config.use_stop_seq else None)
 
         post_process(sampled_token_ids=sampled_token_ids,
                      model_output=model_output_data,
@@ -1065,7 +1069,12 @@ class at the server level, which is too granular for ModelRunner.
             accept_tokens=self.share_inputs["accept_tokens"]
             if self.speculative_decoding else None,
             accept_num=self.share_inputs["accept_num"]
-            if self.speculative_decoding else None)
+            if self.speculative_decoding else None,
+            stop_token_ids=self.share_inputs["stop_seqs"]
+            if self.model_config.use_stop_seq else None,
+            stop_seqs_len=self.share_inputs["stop_seqs_len"]
+            if self.model_config.use_stop_seq else None,
+        )
 
         if self.speculative_config.method in ["mtp"] and \
                 self.parallel_config.splitwise_role == "prefill":
@@ -1076,7 +1085,8 @@ class at the server level, which is too granular for ModelRunner.
             model_output=model_output_data,
             save_each_rank=self.parallel_config.use_ep,
             speculative_decoding=self.speculative_decoding,
-            skip_save_output=skip_save_output)
+            skip_save_output=skip_save_output,
+            use_stop_seq=self.model_config.use_stop_seq)
 
         # 6. Speculative decode
         if self.speculative_decoding:
```
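The prefill path above pads each request's `stop_seqs_len` out to `max_stop_seqs_num` and copies the tokenized stop sequences into a `-1`-filled buffer, now `int64` to match the token-id dtype (the fix in this commit). A standalone sketch of that layout, with made-up token ids and an assumed `max_stop_seqs_num` of 5:

```python
import numpy as np

max_stop_seqs_num, stop_seqs_max_len = 5, 8  # 8 matches FD_STOP_SEQS_MAX_LEN's default; 5 is assumed
stop_seqs = np.full((max_stop_seqs_num, stop_seqs_max_len), -1, dtype="int64")
stop_seqs_len = np.zeros(max_stop_seqs_num, dtype="int32")

# Hypothetical per-request data; the runner reads these off the Request object.
request_stop_token_ids = [[185, 2724, 28], [100007]]
request_stop_seqs_len = [len(s) for s in request_stop_token_ids]

# Pad the length list out to the buffer size, as insert_prefill_inputs does.
request_stop_seqs_len += [0] * (max_stop_seqs_num - len(request_stop_seqs_len))
stop_seqs_len[:] = np.array(request_stop_seqs_len, dtype="int32")

# Copy row by row; the runner copies in a single slice, but this loop
# tolerates ragged sequences and keeps the -1 padding intact.
for i, seq in enumerate(request_stop_token_ids):
    stop_seqs[i, :len(seq)] = np.array(seq, dtype="int64")
```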

fastdeploy/worker/output.py (9 additions, 0 deletions)

```diff
@@ -132,6 +132,15 @@ class ModelOutputData:
     """
     accept_num: paddle.Tensor
 
+    """
+    the token ids of the stop sequences
+    """
+    stop_token_ids: paddle.Tensor
+
+    """
+    the lengths of the stop sequences
+    """
+    stop_seqs_len: paddle.Tensor
 
 @dataclass
 class ModelRunnerOutput:
```

fastdeploy/worker/xpu_model_runner.py (4 additions, 2 deletions)

```diff
@@ -320,9 +320,9 @@ def process_prefill_inputs(self, req_dicts: List[Request]):
                 stop_seqs_num = len(request.get("stop_seqs_len"))
                 for i in range(stop_seqs_num,
                                self.model_config.max_stop_seqs_num):
-                    request.stop_seqs_len.append(0)
+                    request.sampling_params.stop_seqs_len.append(0)
                 self.share_inputs["stop_seqs_len"][:] = np.array(
-                    request.stop_seqs_len, dtype="int32")
+                    request.sampling_params.stop_seqs_len, dtype="int32")
                 self.share_inputs["stop_seqs"][:stop_seqs_num, :len(
                     request.get("stop_token_ids")[0])] = np.array(
                         request.get("stop_token_ids"), dtype="int64")
@@ -719,6 +719,8 @@ class at the server level, which is too granular for ModelRunner.
             actual_draft_token_num=None,
             accept_tokens=None,
             accept_num=None,
+            stop_token_ids=None,
+            stop_seqs_len=None,
         )
         xpu_post_process(sampled_token_ids=sampled_token_ids,
                          model_output=model_output_data)
```
