
Commit 72de4a3

update code
1 parent c7479b1 commit 72de4a3

7 files changed: +23 −26 lines changed

docs/features/structured_outputs.md

Lines changed: 5 additions & 6 deletions

@@ -348,7 +348,7 @@ The following example demonstrates how to use offline inference to generate a st

 ```python
 from fastdeploy import LLM, SamplingParams
-from fastdeploy.sampling_params import GuidedDecodingParams
+from fastdeploy.engine.sampling_params import GuidedDecodingParams
 from pydantic import BaseModel
 from enum import Enum

@@ -375,21 +375,20 @@ sampling_params = SamplingParams(
 )

 # Load model
-llm = LLM(model="ERNIE-4.5-0.3B", tensor_parallel_size=1, max_model_len=8192)
+llm = LLM(model="ERNIE-4.5-0.3B", tensor_parallel_size=1, max_model_len=8192, guided_decoding_backend="auto")

 outputs = llm.generate(
-    prompts="Classify this sentiment: vLLM is wonderful!",
+    prompts="Generate a JSON describing a literary work, including author, title and book type.",
     sampling_params=sampling_params,
 )

 # Output results
 for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs.text
+    print(output.outputs.text)
 ```

 Output:

 ```
-{"author": "Cao Xueqin", "title": "Dream of the Red Chamber", "genre": "Historical"}
+{"author": "George Orwell", "title": "1984", "genre": "Dystopian"}
 ```
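Note: the two hunks above skip the schema that drives the constrained decoding (doc lines 355-374). As a reading aid only, here is a plausible reconstruction of that Pydantic model, inferred from the `title: str` / `genre: BookType` context visible in the zh-doc hunk below and from the keys in the example outputs; the enum members are assumptions, not part of this commit.

```python
# Hypothetical reconstruction of the schema the example relies on. Only the
# field names and the "Dystopian" / "Historical" genre values are visible in
# this commit; everything else is assumed for illustration.
from enum import Enum
from pydantic import BaseModel


class BookType(str, Enum):
    dystopian = "Dystopian"
    historical = "Historical"


class BookDescription(BaseModel):
    author: str
    title: str
    genre: BookType


# The example then constrains generation to this schema via:
# guided_decoding_params = GuidedDecodingParams(json=BookDescription.model_json_schema())
```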

docs/zh/features/structured_outputs.md

Lines changed: 8 additions & 9 deletions

@@ -349,7 +349,7 @@ structural_tag: Optional[str] = None
 ```python

 from fastdeploy import LLM, SamplingParams
-from fastdeploy.sampling_params import GuidedDecodingParams
+from fastdeploy.engine.sampling_params import GuidedDecodingParams
 from pydantic import BaseModel
 from enum import Enum

@@ -365,28 +365,27 @@ class BookDescription(BaseModel):
     title: str
     genre: BookType

-# 受限解码参数
+# Constrained decoding parameters
 guided_decoding_params = GuidedDecodingParams(json=BookDescription.model_json_schema())

-# 采样参数
+# Sampling parameters
 sampling_params = SamplingParams(
     top_p=0.95,
     max_tokens=6400,
     guided_decoding=guided_decoding_params,
 )

-# 加载模型
-llm = LLM(model="ERNIE-4.5-0.3B", tensor_parallel_size=1, max_model_len=8192)
+# Load model
+llm = LLM(model="ERNIE-4.5-0.3B", tensor_parallel_size=1, max_model_len=8192, guided_decoding_backend="auto")

 outputs = llm.generate(
-    prompts="Classify this sentiment: vLLM is wonderful!",
+    prompts="生成一个JSON，描述一本中国的著作，要包含作者、标题和书籍类型。",
     sampling_params=sampling_params,
 )

-# 输出结果
+# Output results
 for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs.text
+    print(output.outputs.text)

 ```
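Besides json=, the fields checked in the sampling_params.py hunk below (json, regex, choice) and the structural_tag context in this doc's first hunk suggest GuidedDecodingParams supports other constraint kinds. A hedged sketch; the regex= and choice= keyword names and value types are assumed to mirror the field names and are not confirmed by this diff.

```python
# Hedged sketch: alternative GuidedDecodingParams constraints. Only the field
# names (json, regex, choice, structural_tag) are visible in this commit; the
# value types expected for regex/choice are assumptions.
from fastdeploy import SamplingParams
from fastdeploy.engine.sampling_params import GuidedDecodingParams

# Constrain output to one of a fixed set of strings.
choice_params = GuidedDecodingParams(choice=["positive", "negative", "neutral"])

# Or constrain output to match a regular expression.
regex_params = GuidedDecodingParams(regex=r"(positive|negative|neutral)")

sampling_params = SamplingParams(
    top_p=0.95,
    max_tokens=64,
    guided_decoding=choice_params,
)
```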

fastdeploy/engine/sampling_params.py

Lines changed: 1 addition & 4 deletions

@@ -210,9 +210,6 @@ def _verify_args(self) -> None:
             raise ValueError("seed must be in [0, 922337203685477580], got "
                              f"{self.seed}.")

-        if self.guided_decoding is not None:
-            self.guided_decoding._verify_args()
-
     def update_from_tokenizer(self, tokenizer):
         """
         # TODO: Implement stop tokens and bad words support

@@ -259,7 +256,7 @@ def to_dict(self):
                 guided_dict[key] = value
         return guided_dict

-    def _verify_args(self):
+    def __post_init__(self):
         """Verify the arguments."""
         guided_count = sum([
             self.json is not None, self.regex is not None, self.choice
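The rename from `_verify_args` to `__post_init__` moves the check to construction time, which is why `SamplingParams._verify_args` no longer calls into the guided-decoding object. A minimal sketch of that pattern, using a stand-in dataclass rather than the real `GuidedDecodingParams`:

```python
# Stand-in illustration of validating a dataclass in __post_init__; not the
# actual FastDeploy class, just the pattern this hunk adopts.
from dataclasses import dataclass
from typing import Optional


@dataclass
class GuidedParamsSketch:
    json: Optional[dict] = None
    regex: Optional[str] = None
    choice: Optional[list] = None

    def __post_init__(self):
        # Mirrors the guided_count check: count how many constraint kinds are set.
        guided_count = sum([
            self.json is not None, self.regex is not None, self.choice is not None
        ])
        if guided_count > 1:
            raise ValueError("Only one guided decoding constraint may be set.")


GuidedParamsSketch(regex=".*")                    # validated on construction
try:
    GuidedParamsSketch(regex=".*", choice=["a"])  # raises immediately, no separate call needed
except ValueError:
    pass
```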

fastdeploy/input/ernie_processor.py

Lines changed: 3 additions & 3 deletions

@@ -100,7 +100,7 @@ def process_request(self, request, max_model_len=None, **kwargs):

         if request.prompt_token_ids is None or len(
                 request.prompt_token_ids) == 0:
-            system = request.get("system")
+            # system = request.get("system")
             if request.prompt is None and request.messages is None:
                 raise ValueError(
                     f"The request should have `input_ids`, `text` or `messages`: {request}.")

@@ -149,7 +149,7 @@ def process_request_dict(self, request, max_model_len=None):
             request['stop_token_ids'] = stop_seqs
             request['stop_seqs_len'] = stop_seqs_len

-        system = request.get("system")
+        # system = request.get("system")
         # 处理prompt_token_ids
         if not request.get('prompt_token_ids'):
             if request.get('prompt') is None and request.get(

@@ -213,7 +213,7 @@ def process_response(self, response_dict, **kwargs):
             response_dict.outputs.reasoning_content = reasoning_content
         else:
             response_dict.outputs.text = full_text
-        data_processor_logger.info(f"req_id:{req_id}, token)ids: {token_ids}")
+        data_processor_logger.info(f"req_id:{req_id}, token ids: {token_ids}")
         if response_dict.outputs.text == "" and \
                 response_dict.outputs.reasoning_content == "":
             return None

fastdeploy/model_executor/layers/sample/sampler.py

Lines changed: 2 additions & 3 deletions

@@ -125,15 +125,14 @@ def apply_token_mask(self,

         indices = []
         for idx, processor in self.logits_processor.items():
-            if processor is None:
+            if processor is None or idx in skip_idx_list:
                 continue
             if not processor.enable_reasoning or processor.reasoning_ended:
                 indices.append(idx)

-        mask_idx = [i for i in indices if i not in skip_idx_list]
         return available_processors.apply_token_mask(logits,
                                                      self.token_bitmask,
-                                                     indices=mask_idx)
+                                                     indices=indices)

     def _accept_token(self, idx: int, token: int):
         """ accept token """

fastdeploy/worker/gpu_model_runner.py

Lines changed: 1 addition & 1 deletion

@@ -1024,7 +1024,7 @@ class at the server level, which is too granular for ModelRunner.
             if self.parallel_config.tensor_parallel_degree > 1:
                 paddle.distributed.broadcast(sampler_output.sampled_token_ids, 0)

-            self.sampler.post_process(sampled_token_ids, skip_idx_list)
+            self.sampler.post_process(sampler_output.sampled_token_ids, skip_idx_list)
         else:
             self.sampler(logits, self.sampling_metadata,
                          self.parallel_config.max_model_len, self.share_inputs)

fastdeploy/worker/worker_process.py

Lines changed: 3 additions & 0 deletions

@@ -606,7 +606,10 @@ def getattr_without_none(obj, attr_name, default=None):
     # Get model config from model directory
     model_config_dict, _ = ModelConfig.get_config_dict(config_or_args.model_name_or_path)

+<<<<<<< HEAD

+=======
+>>>>>>> 75f51506 (update code)
     # Handle MoE related configs
     if 'num_experts' in model_config_dict:
         model_config_dict['moe_num_experts'] = model_config_dict.pop('num_experts')
