
Commit c7479b1

update docs
1 parent a0293e9 commit c7479b1

3 files changed: +133 -4 lines

docs/features/structured_outputs.md

Lines changed: 63 additions & 0 deletions
@@ -330,3 +330,66 @@ ParsedChatCompletionMessage[Info](content='{"addr": "No.1 Century Avenue, Pudong
Address: No.1 Century Avenue, Pudong New Area, Shanghai
Height: 468
```

### Offline Inference

Offline inference restricts the model's output format through constraints specified up front. In `FastDeploy`, constraints are passed via the `GuidedDecodingParams` class in `SamplingParams`. `GuidedDecodingParams` supports the following constraint types, whose usage mirrors the online-inference options described above:

```python
json: Optional[Union[str, dict]] = None
regex: Optional[str] = None
choice: Optional[List[str]] = None
grammar: Optional[str] = None
json_object: Optional[bool] = None
structural_tag: Optional[str] = None
```

The following example demonstrates how to use offline inference to generate structured JSON:

```python
from fastdeploy import LLM, SamplingParams
from fastdeploy.sampling_params import GuidedDecodingParams
from pydantic import BaseModel
from enum import Enum

class BookType(str, Enum):
    romance = "Romance"
    historical = "Historical"
    adventure = "Adventure"
    mystery = "Mystery"
    dystopian = "Dystopian"

class BookDescription(BaseModel):
    author: str
    title: str
    genre: BookType

# Constrained decoding parameters
guided_decoding_params = GuidedDecodingParams(json=BookDescription.model_json_schema())

# Sampling parameters
sampling_params = SamplingParams(
    top_p=0.95,
    max_tokens=6400,
    guided_decoding=guided_decoding_params,
)

# Load the model
llm = LLM(model="ERNIE-4.5-0.3B", tensor_parallel_size=1, max_model_len=8192)

outputs = llm.generate(
    prompts="Describe a classic novel as JSON.",
    sampling_params=sampling_params,
)

# Print the structured output
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs.text
    print(generated_text)
```

Output:

```
{"author": "Cao Xueqin", "title": "Dream of the Red Chamber", "genre": "Historical"}
```
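
The other constraint types follow the same pattern. As a minimal sketch (assuming only the `GuidedDecodingParams` fields listed above, with `choice` accepting a list of allowed strings as its signature suggests), a `choice` constraint limits generation to one of a fixed set of outputs:

```python
from fastdeploy import LLM, SamplingParams
from fastdeploy.sampling_params import GuidedDecodingParams

# Restrict decoding to exactly one of the listed strings.
guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative", "Neutral"])

sampling_params = SamplingParams(
    top_p=0.95,
    max_tokens=64,
    guided_decoding=guided_decoding_params,
)

llm = LLM(model="ERNIE-4.5-0.3B", tensor_parallel_size=1, max_model_len=8192)

outputs = llm.generate(
    prompts="Classify this sentiment: FastDeploy is wonderful!",
    sampling_params=sampling_params,
)

for output in outputs:
    print(output.outputs.text)  # constrained to one of the three choices, e.g. "Positive"
```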

docs/zh/features/structured_outputs.md

Lines changed: 65 additions & 0 deletions
@@ -330,3 +330,68 @@ ParsedChatCompletionMessage[Info](content='{"addr": "上海市浦东新区世纪
Address: No.1 Century Avenue, Pudong New Area, Shanghai
Height: 468
```

### Offline Inference

Offline inference restricts the model's output format through constraints specified up front. In `FastDeploy`, constraints are passed via the `GuidedDecodingParams` class in `SamplingParams`. `GuidedDecodingParams` supports the following constraint types, whose usage mirrors the online-inference options described above:

```python
json: Optional[Union[str, dict]] = None
regex: Optional[str] = None
choice: Optional[List[str]] = None
grammar: Optional[str] = None
json_object: Optional[bool] = None
structural_tag: Optional[str] = None
```

The following example demonstrates how to use offline inference to generate structured JSON:

```python
from fastdeploy import LLM, SamplingParams
from fastdeploy.sampling_params import GuidedDecodingParams
from pydantic import BaseModel
from enum import Enum

class BookType(str, Enum):
    romance = "Romance"
    historical = "Historical"
    adventure = "Adventure"
    mystery = "Mystery"
    dystopian = "Dystopian"

class BookDescription(BaseModel):
    author: str
    title: str
    genre: BookType

# Constrained decoding parameters
guided_decoding_params = GuidedDecodingParams(json=BookDescription.model_json_schema())

# Sampling parameters
sampling_params = SamplingParams(
    top_p=0.95,
    max_tokens=6400,
    guided_decoding=guided_decoding_params,
)

# Load the model
llm = LLM(model="ERNIE-4.5-0.3B", tensor_parallel_size=1, max_model_len=8192)

outputs = llm.generate(
    prompts="Describe a classic novel as JSON.",
    sampling_params=sampling_params,
)

# Print the structured output
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs.text
    print(generated_text)
```

Output:

```
{"author": "曹雪芹", "title": "红楼梦", "genre": "Historical"}
```

fastdeploy/worker/gpu_model_runner.py

Lines changed: 5 additions & 4 deletions
@@ -67,16 +67,17 @@ def __init__(
```diff
         self.speculative_decoding = self.speculative_method is not None
         self.enable_logprob = fd_config.model_config.enable_logprob

-        self.guided_backend = None
-        if self.fd_config.parallel_config.guided_decoding_backend != "off":
-            self.guided_backend = get_guided_backend(fd_config=self.fd_config)
-
         # Sampler
         if not self.speculative_decoding:
             self.sampler = Sampler()
         else:
             self.sampler = SpeculativeSampler(fd_config)

+        self.guided_backend = None
+        if self.fd_config.parallel_config.guided_decoding_backend != "off":
+            self.guided_backend = get_guided_backend(fd_config=self.fd_config)
+            self.sampler.set_reasoning_parser(self.guided_backend.get_reasoning_parser())
+
         # Lazy initialize kv cache after model loading
         # self.kv_caches: list[paddle.Tensor] = []
```
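
The move matters because the newly added `set_reasoning_parser` call touches `self.sampler`, so the guided-decoding backend can only be wired up after the sampler exists. A minimal standalone sketch of the resulting initialization order (names taken from the diff above; the surrounding config plumbing is assumed):

```python
# Sketch of the post-commit ordering in GPUModelRunner.__init__ (assumed context).
# 1) Build the sampler first, plain or speculative.
sampler = SpeculativeSampler(fd_config) if speculative_decoding else Sampler()

# 2) Only then build the guided-decoding backend and register its reasoning
#    parser on the sampler; with the old ordering, the sampler would not
#    exist yet at this point.
guided_backend = None
if fd_config.parallel_config.guided_decoding_backend != "off":
    guided_backend = get_guided_backend(fd_config=fd_config)
    sampler.set_reasoning_parser(guided_backend.get_reasoning_parser())
```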
