
Commit c7479b1

update docs
1 parent a0293e9 commit c7479b1

3 files changed: +133 -4 lines

docs/features/structured_outputs.md

Lines changed: 63 additions & 0 deletions
@@ -330,3 +330,66 @@ ParsedChatCompletionMessage[Info](content='{"addr": "No.1 Century Avenue, Pudong
Address: No.1 Century Avenue, Pudong New Area, Shanghai
Height: 468
```

### Offline Inference

Offline inference restricts the model's output format through constraints specified up front. In `FastDeploy`, constraints are passed via the `GuidedDecodingParams` class in `SamplingParams`. `GuidedDecodingParams` supports the following constraint types, whose usage mirrors the online-inference options described above:

```python
json: Optional[Union[str, dict]] = None
regex: Optional[str] = None
choice: Optional[List[str]] = None
grammar: Optional[str] = None
json_object: Optional[bool] = None
structural_tag: Optional[str] = None
```

The following example demonstrates how to use offline inference to generate structured JSON:

```python
from fastdeploy import LLM, SamplingParams
from fastdeploy.sampling_params import GuidedDecodingParams
from pydantic import BaseModel
from enum import Enum

class BookType(str, Enum):
    romance = "Romance"
    historical = "Historical"
    adventure = "Adventure"
    mystery = "Mystery"
    dystopian = "Dystopian"

class BookDescription(BaseModel):
    author: str
    title: str
    genre: BookType

# Constrained decoding parameters
guided_decoding_params = GuidedDecodingParams(json=BookDescription.model_json_schema())

# Sampling parameters
sampling_params = SamplingParams(
    top_p=0.95,
    max_tokens=6400,
    guided_decoding=guided_decoding_params,
)

# Load the model
llm = LLM(model="ERNIE-4.5-0.3B", tensor_parallel_size=1, max_model_len=8192)

outputs = llm.generate(
    prompts="Describe a classic novel as JSON.",
    sampling_params=sampling_params,
)

# Print the structured output
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs.text
    print(generated_text)
```

Output:

```
{"author": "Cao Xueqin", "title": "Dream of the Red Chamber", "genre": "Historical"}
```
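
The other constraint types follow the same pattern. As a minimal sketch (assuming only the `GuidedDecodingParams` fields listed above, with `choice` accepting a list of allowed strings as its signature suggests), a `choice` constraint limits generation to one of a fixed set of outputs:

```python
from fastdeploy import LLM, SamplingParams
from fastdeploy.sampling_params import GuidedDecodingParams

# Restrict decoding to exactly one of the listed strings.
guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative", "Neutral"])

sampling_params = SamplingParams(
    top_p=0.95,
    max_tokens=64,
    guided_decoding=guided_decoding_params,
)

llm = LLM(model="ERNIE-4.5-0.3B", tensor_parallel_size=1, max_model_len=8192)

outputs = llm.generate(
    prompts="Classify this sentiment: FastDeploy is wonderful!",
    sampling_params=sampling_params,
)

for output in outputs:
    print(output.outputs.text)  # constrained to one of the three choices, e.g. "Positive"
```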

docs/zh/features/structured_outputs.md

Lines changed: 65 additions & 0 deletions
@@ -330,3 +330,68 @@ ParsedChatCompletionMessage[Info](content='{"addr": "上海市浦东新区世纪
Address: No.1 Century Avenue, Pudong New Area, Shanghai
Height: 468
```

### Offline Inference

Offline inference restricts the model's output format through constraints specified up front. In `FastDeploy`, constraints are passed via the `GuidedDecodingParams` class in `SamplingParams`. `GuidedDecodingParams` supports the following constraint types, whose usage mirrors the online-inference options described above:

```python
json: Optional[Union[str, dict]] = None
regex: Optional[str] = None
choice: Optional[List[str]] = None
grammar: Optional[str] = None
json_object: Optional[bool] = None
structural_tag: Optional[str] = None
```

The following example demonstrates how to use offline inference to generate structured JSON:

```python
from fastdeploy import LLM, SamplingParams
from fastdeploy.sampling_params import GuidedDecodingParams
from pydantic import BaseModel
from enum import Enum

class BookType(str, Enum):
    romance = "Romance"
    historical = "Historical"
    adventure = "Adventure"
    mystery = "Mystery"
    dystopian = "Dystopian"

class BookDescription(BaseModel):
    author: str
    title: str
    genre: BookType

# Constrained decoding parameters
guided_decoding_params = GuidedDecodingParams(json=BookDescription.model_json_schema())

# Sampling parameters
sampling_params = SamplingParams(
    top_p=0.95,
    max_tokens=6400,
    guided_decoding=guided_decoding_params,
)

# Load the model
llm = LLM(model="ERNIE-4.5-0.3B", tensor_parallel_size=1, max_model_len=8192)

outputs = llm.generate(
    prompts="Describe a classic novel as JSON.",
    sampling_params=sampling_params,
)

# Print the structured output
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs.text
    print(generated_text)
```

Output:

```
{"author": "曹雪芹", "title": "红楼梦", "genre": "Historical"}
```

fastdeploy/worker/gpu_model_runner.py

Lines changed: 5 additions & 4 deletions
@@ -67,16 +67,17 @@ def __init__(
```diff
         self.speculative_decoding = self.speculative_method is not None
         self.enable_logprob = fd_config.model_config.enable_logprob

-        self.guided_backend = None
-        if self.fd_config.parallel_config.guided_decoding_backend != "off":
-            self.guided_backend = get_guided_backend(fd_config=self.fd_config)
-
         # Sampler
         if not self.speculative_decoding:
             self.sampler = Sampler()
         else:
             self.sampler = SpeculativeSampler(fd_config)

+        self.guided_backend = None
+        if self.fd_config.parallel_config.guided_decoding_backend != "off":
+            self.guided_backend = get_guided_backend(fd_config=self.fd_config)
+            self.sampler.set_reasoning_parser(self.guided_backend.get_reasoning_parser())
+
         # Lazy initialize kv cache after model loading
         # self.kv_caches: list[paddle.Tensor] = []
```
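
The move matters because the newly added `set_reasoning_parser` call touches `self.sampler`, so the guided-decoding backend can only be wired up after the sampler exists. A minimal standalone sketch of the resulting initialization order (names taken from the diff above; the surrounding config plumbing is assumed):

```python
# Sketch of the post-commit ordering in GPUModelRunner.__init__ (assumed context).
# 1) Build the sampler first, plain or speculative.
sampler = SpeculativeSampler(fd_config) if speculative_decoding else Sampler()

# 2) Only then build the guided-decoding backend and register its reasoning
#    parser on the sampler; with the old ordering, the sampler would not
#    exist yet at this point.
guided_backend = None
if fd_config.parallel_config.guided_decoding_backend != "off":
    guided_backend = get_guided_backend(fd_config=fd_config)
    sampler.set_reasoning_parser(guided_backend.get_reasoning_parser())
```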
