
Commit 75f5150

update code
1 parent 423c024 commit 75f5150

File tree: 6 files changed, +29 −30 lines changed

docs/features/structured_outputs.md

Lines changed: 5 additions & 6 deletions

````diff
@@ -348,7 +348,7 @@ The following example demonstrates how to use offline inference to generate a st
 
 ```python
 from fastdeploy import LLM, SamplingParams
-from fastdeploy.sampling_params import GuidedDecodingParams
+from fastdeploy.engine.sampling_params import GuidedDecodingParams
 from pydantic import BaseModel
 from enum import Enum
 
@@ -375,21 +375,20 @@ sampling_params = SamplingParams(
 )
 
 # Load model
-llm = LLM(model="ERNIE-4.5-0.3B", tensor_parallel_size=1, max_model_len=8192)
+llm = LLM(model="ERNIE-4.5-0.3B", tensor_parallel_size=1, max_model_len=8192, guided_decoding_backend="auto")
 
 outputs = llm.generate(
-    prompts="Classify this sentiment: vLLM is wonderful!",
+    prompts="Generate a JSON describing a literary work, including author, title and book type.",
     sampling_params=sampling_params,
 )
 
 # Output results
 for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs.text
+    print(output.outputs.text)
 ```
 
 Output:
 
 ```
-{"author": "Cao Xueqin", "title": "Dream of the Red Chamber", "genre": "Historical"}
+{"author": "George Orwell", "title": "1984", "genre": "Dystopian"}
 ```
````
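For context, here is roughly how the full offline example reads after this commit. This is a reconstruction, not the documentation's verbatim text: the `BookDescription` fields other than `title` and `genre`, and the `BookType` enum members, are assumptions, since only fragments of the class appear in the diff context.

```python
from enum import Enum

from pydantic import BaseModel

from fastdeploy import LLM, SamplingParams
from fastdeploy.engine.sampling_params import GuidedDecodingParams


class BookType(str, Enum):
    # Hypothetical members; the actual docs may list different genres.
    fiction = "Fiction"
    dystopian = "Dystopian"
    historical = "Historical"


class BookDescription(BaseModel):
    author: str  # assumed from the sample output, which includes "author"
    title: str
    genre: BookType


# Constrain generation to the JSON schema derived from BookDescription.
guided_decoding_params = GuidedDecodingParams(json=BookDescription.model_json_schema())

sampling_params = SamplingParams(
    top_p=0.95,
    max_tokens=6400,
    guided_decoding=guided_decoding_params,
)

# guided_decoding_backend="auto" lets FastDeploy pick the backend.
llm = LLM(model="ERNIE-4.5-0.3B", tensor_parallel_size=1,
          max_model_len=8192, guided_decoding_backend="auto")

outputs = llm.generate(
    prompts="Generate a JSON describing a literary work, including author, title and book type.",
    sampling_params=sampling_params,
)

for output in outputs:
    print(output.outputs.text)
```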

docs/zh/features/structured_outputs.md

Lines changed: 8 additions & 9 deletions

````diff
@@ -349,7 +349,7 @@ structural_tag: Optional[str] = None
 ```python
 
 from fastdeploy import LLM, SamplingParams
-from fastdeploy.sampling_params import GuidedDecodingParams
+from fastdeploy.engine.sampling_params import GuidedDecodingParams
 from pydantic import BaseModel
 from enum import Enum
 
@@ -365,28 +365,27 @@ class BookDescription(BaseModel):
     title: str
     genre: BookType
 
-# 受限解码参数
+# Constrained decoding parameters
 guided_decoding_params = GuidedDecodingParams(json=BookDescription.model_json_schema())
 
-# 采样参数
+# Sampling parameters
 sampling_params = SamplingParams(
     top_p=0.95,
     max_tokens=6400,
     guided_decoding=guided_decoding_params,
 )
 
-# 加载模型
-llm = LLM(model="ERNIE-4.5-0.3B", tensor_parallel_size=1, max_model_len=8192)
+# Load model
+llm = LLM(model="ERNIE-4.5-0.3B", tensor_parallel_size=1, max_model_len=8192, guided_decoding_backend="auto")
 
 outputs = llm.generate(
-    prompts="Classify this sentiment: vLLM is wonderful!",
+    prompts="生成一个JSON,描述一本中国的著作,要包含作者、标题和书籍类型。",
     sampling_params=sampling_params,
 )
 
-# 输出结果
+# Output results
 for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs.text
+    print(output.outputs.text)
 
 ```
 
````

fastdeploy/engine/sampling_params.py

Lines changed: 1 addition & 4 deletions

```diff
@@ -196,9 +196,6 @@ def _verify_args(self) -> None:
             raise ValueError("seed must be in [0, 922337203685477580], got "
                              f"{self.seed}.")
 
-        if self.guided_decoding is not None:
-            self.guided_decoding._verify_args()
-
     def update_from_tokenizer(self, tokenizer):
         """
         # TODO: Implement stop tokens and bad words support
@@ -245,7 +242,7 @@ def to_dict(self):
                 guided_dict[key] = value
         return guided_dict
 
-    def _verify_args(self):
+    def __post_init__(self):
         """Verify the arguments."""
         guided_count = sum([
             self.json is not None, self.regex is not None, self.choice
```
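A note on the `_verify_args` → `__post_init__` rename: `GuidedDecodingParams` appears to be a dataclass, and a dataclass's `__post_init__` runs automatically at construction time, which is why the explicit `self.guided_decoding._verify_args()` call in `SamplingParams._verify_args` could be deleted. A minimal sketch of the pattern, with generic stand-in fields rather than FastDeploy's real ones:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class GuidedParams:
    # Stand-ins for the real json/regex/choice/grammar options.
    json_schema: Optional[dict] = None
    regex: Optional[str] = None

    def __post_init__(self):
        # Runs right after __init__, so callers get validation for free
        # instead of having to remember a separate _verify_args() call.
        if sum(v is not None for v in (self.json_schema, self.regex)) > 1:
            raise ValueError("only one guided decoding format may be set")


GuidedParams(json_schema={"type": "object"})   # fine
# GuidedParams(json_schema={}, regex=r"\d+")   # would raise at construction
```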

fastdeploy/input/ernie_processor.py

Lines changed: 4 additions & 5 deletions

```diff
@@ -20,10 +20,9 @@
 from paddleformers.generation import GenerationConfig
 
 from fastdeploy import envs
-from fastdeploy.utils import data_processor_logger
 from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer
-
 from fastdeploy.input.text_processor import BaseDataProcessor
+from fastdeploy.utils import data_processor_logger
 
 _SAMPLING_EPS = 1e-5
 
@@ -101,7 +100,7 @@ def process_request(self, request, max_model_len=None, **kwargs):
 
         if request.prompt_token_ids is None or len(
                 request.prompt_token_ids) == 0:
-            system = request.get("system")
+            # system = request.get("system")
             if request.prompt is None and request.messages is None:
                 raise ValueError(
                     f"The request should have `input_ids`, `text` or `messages`: {request}.")
@@ -150,7 +149,7 @@ def process_request_dict(self, request, max_model_len=None):
             request['stop_token_ids'] = stop_seqs
             request['stop_seqs_len'] = stop_seqs_len
 
-        system = request.get("system")
+        # system = request.get("system")
         # 处理prompt_token_ids
         if not request.get('prompt_token_ids'):
             if request.get('prompt') is None and request.get(
@@ -214,7 +213,7 @@ def process_response(self, response_dict, **kwargs):
             response_dict.outputs.reasoning_content = reasoning_content
         else:
             response_dict.outputs.text = full_text
-        data_processor_logger.info(f"req_id:{req_id}, token)ids: {token_ids}")
+        data_processor_logger.info(f"req_id:{req_id}, token ids: {token_ids}")
         if response_dict.outputs.text == "" and \
            response_dict.outputs.reasoning_content == "" and \
            response_dict.outputs.tool_call_content == []:
```

fastdeploy/model_executor/layers/sample/sampler.py

Lines changed: 2 additions & 3 deletions

```diff
@@ -124,15 +124,14 @@ def apply_token_mask(self,
 
         indices = []
         for idx, processor in self.logits_processor.items():
-            if processor is None:
+            if processor is None or idx in skip_idx_list:
                 continue
             if not processor.enable_reasoning or processor.reasoning_ended:
                 indices.append(idx)
 
-        mask_idx = [i for i in indices if i not in skip_idx_list]
         return available_processors.apply_token_mask(logits,
                                                      self.token_bitmask,
-                                                     indices=mask_idx)
+                                                     indices=indices)
 
     def _accept_token(self, idx: int, token: int):
         """ accept token """
```

fastdeploy/worker/worker_process.py

Lines changed: 9 additions & 3 deletions

```diff
@@ -579,10 +579,16 @@ def initialize_fd_config(config_or_args) -> FDConfig:
     Returns:
         FDConfig: Initialized FastDeploy configuration object
     """
-    # Get model config from model directory
-    model_config_dict, _ = ModelConfig.get_config_dict(config_or_args.model_name_or_path)
 
+    def getattr_without_none(obj, attr_name, default=None):
+        if hasattr(obj, attr_name):
+            if getattr(obj, attr_name) == "None":
+                return default
+            return getattr(obj, attr_name)
+        return default
 
+    # Get model config from model directory
+    model_config_dict, _ = ModelConfig.get_config_dict(config_or_args.model_name_or_path)
 
     # Handle MoE related configs
     if 'num_experts' in model_config_dict:
@@ -624,7 +630,7 @@ def initialize_fd_config(config_or_args) -> FDConfig:
 
     # Handle quantization (check for attribute existence)
     model_config.quantization = getattr(config_or_args, 'quantization', None)
-    model_config.reasoning_parser = getattr(config_or_args, 'reasoning_parser', None)
+    model_config.reasoning_parser = getattr_without_none(config_or_args, 'reasoning_parser', None)
 
     # Update speculative config_or_args
     speculative_config.method = getattr(config_or_args, 'speculative_method', None)
```
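The new `getattr_without_none` helper appears to exist because attribute values can arrive as the literal string `"None"` rather than Python's `None` (as can happen when values round-trip through command-line parsing); the helper maps that back to the default. A quick usage illustration with a stand-in for `config_or_args`:

```python
from types import SimpleNamespace


def getattr_without_none(obj, attr_name, default=None):
    # Treat the string "None" as if the attribute were unset.
    if hasattr(obj, attr_name):
        if getattr(obj, attr_name) == "None":
            return default
        return getattr(obj, attr_name)
    return default


# Stand-in args object; "None" mimics a stringified CLI value.
args = SimpleNamespace(reasoning_parser="None", quantization="wint8")

assert getattr_without_none(args, "reasoning_parser") is None
assert getattr_without_none(args, "quantization") == "wint8"
assert getattr_without_none(args, "missing", default="fb") == "fb"
```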
