
Commit 72de4a3

update code
1 parent c7479b1 commit 72de4a3

7 files changed: +23 −26 lines changed

docs/features/structured_outputs.md

Lines changed: 5 additions & 6 deletions

@@ -348,7 +348,7 @@ The following example demonstrates how to use offline inference to generate a st

 ```python
 from fastdeploy import LLM, SamplingParams
-from fastdeploy.sampling_params import GuidedDecodingParams
+from fastdeploy.engine.sampling_params import GuidedDecodingParams
 from pydantic import BaseModel
 from enum import Enum

@@ -375,21 +375,20 @@ sampling_params = SamplingParams(
 )

 # Load model
-llm = LLM(model="ERNIE-4.5-0.3B", tensor_parallel_size=1, max_model_len=8192)
+llm = LLM(model="ERNIE-4.5-0.3B", tensor_parallel_size=1, max_model_len=8192, guided_decoding_backend="auto")

 outputs = llm.generate(
-    prompts="Classify this sentiment: vLLM is wonderful!",
+    prompts="Generate a JSON describing a literary work, including author, title and book type.",
     sampling_params=sampling_params,
 )

 # Output results
 for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs.text
+    print(output.outputs.text)
 ```

 Output:

 ```
-{"author": "Cao Xueqin", "title": "Dream of the Red Chamber", "genre": "Historical"}
+{"author": "George Orwell", "title": "1984", "genre": "Dystopian"}
 ```
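Note: the two hunks above skip the schema that drives the constrained decoding (doc lines 355-374). As a reading aid only, here is a plausible reconstruction of that Pydantic model, inferred from the `title: str` / `genre: BookType` context visible in the zh-doc hunk below and from the keys in the example outputs; the enum members are assumptions, not part of this commit.

```python
# Hypothetical reconstruction of the schema the example relies on. Only the
# field names and the "Dystopian" / "Historical" genre values are visible in
# this commit; everything else is assumed for illustration.
from enum import Enum
from pydantic import BaseModel


class BookType(str, Enum):
    dystopian = "Dystopian"
    historical = "Historical"


class BookDescription(BaseModel):
    author: str
    title: str
    genre: BookType


# The example then constrains generation to this schema via:
# guided_decoding_params = GuidedDecodingParams(json=BookDescription.model_json_schema())
```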

docs/zh/features/structured_outputs.md

Lines changed: 8 additions & 9 deletions

@@ -349,7 +349,7 @@ structural_tag: Optional[str] = None
 ```python

 from fastdeploy import LLM, SamplingParams
-from fastdeploy.sampling_params import GuidedDecodingParams
+from fastdeploy.engine.sampling_params import GuidedDecodingParams
 from pydantic import BaseModel
 from enum import Enum

@@ -365,28 +365,27 @@ class BookDescription(BaseModel):
     title: str
     genre: BookType

-# 受限解码参数
+# Constrained decoding parameters
 guided_decoding_params = GuidedDecodingParams(json=BookDescription.model_json_schema())

-# 采样参数
+# Sampling parameters
 sampling_params = SamplingParams(
     top_p=0.95,
     max_tokens=6400,
     guided_decoding=guided_decoding_params,
 )

-# 加载模型
-llm = LLM(model="ERNIE-4.5-0.3B", tensor_parallel_size=1, max_model_len=8192)
+# Load model
+llm = LLM(model="ERNIE-4.5-0.3B", tensor_parallel_size=1, max_model_len=8192, guided_decoding_backend="auto")

 outputs = llm.generate(
-    prompts="Classify this sentiment: vLLM is wonderful!",
+    prompts="生成一个JSON，描述一本中国的著作，要包含作者、标题和书籍类型。",
     sampling_params=sampling_params,
 )

-# 输出结果
+# Output results
 for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs.text
+    print(output.outputs.text)

 ```
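Besides json=, the fields checked in the sampling_params.py hunk below (json, regex, choice) and the structural_tag context in this doc's first hunk suggest GuidedDecodingParams supports other constraint kinds. A hedged sketch; the regex= and choice= keyword names and value types are assumed to mirror the field names and are not confirmed by this diff.

```python
# Hedged sketch: alternative GuidedDecodingParams constraints. Only the field
# names (json, regex, choice, structural_tag) are visible in this commit; the
# value types expected for regex/choice are assumptions.
from fastdeploy import SamplingParams
from fastdeploy.engine.sampling_params import GuidedDecodingParams

# Constrain output to one of a fixed set of strings.
choice_params = GuidedDecodingParams(choice=["positive", "negative", "neutral"])

# Or constrain output to match a regular expression.
regex_params = GuidedDecodingParams(regex=r"(positive|negative|neutral)")

sampling_params = SamplingParams(
    top_p=0.95,
    max_tokens=64,
    guided_decoding=choice_params,
)
```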

fastdeploy/engine/sampling_params.py

Lines changed: 1 addition & 4 deletions

@@ -210,9 +210,6 @@ def _verify_args(self) -> None:
             raise ValueError("seed must be in [0, 922337203685477580], got "
                              f"{self.seed}.")

-        if self.guided_decoding is not None:
-            self.guided_decoding._verify_args()
-
     def update_from_tokenizer(self, tokenizer):
         """
         # TODO: Implement stop tokens and bad words support

@@ -259,7 +256,7 @@ def to_dict(self):
                 guided_dict[key] = value
         return guided_dict

-    def _verify_args(self):
+    def __post_init__(self):
         """Verify the arguments."""
         guided_count = sum([
             self.json is not None, self.regex is not None, self.choice
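The rename from `_verify_args` to `__post_init__` moves the check to construction time, which is why `SamplingParams._verify_args` no longer calls into the guided-decoding object. A minimal sketch of that pattern, using a stand-in dataclass rather than the real `GuidedDecodingParams`:

```python
# Stand-in illustration of validating a dataclass in __post_init__; not the
# actual FastDeploy class, just the pattern this hunk adopts.
from dataclasses import dataclass
from typing import Optional


@dataclass
class GuidedParamsSketch:
    json: Optional[dict] = None
    regex: Optional[str] = None
    choice: Optional[list] = None

    def __post_init__(self):
        # Mirrors the guided_count check: count how many constraint kinds are set.
        guided_count = sum([
            self.json is not None, self.regex is not None, self.choice is not None
        ])
        if guided_count > 1:
            raise ValueError("Only one guided decoding constraint may be set.")


GuidedParamsSketch(regex=".*")                    # validated on construction
try:
    GuidedParamsSketch(regex=".*", choice=["a"])  # raises immediately, no separate call needed
except ValueError:
    pass
```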

fastdeploy/input/ernie_processor.py

Lines changed: 3 additions & 3 deletions

@@ -100,7 +100,7 @@ def process_request(self, request, max_model_len=None, **kwargs):

         if request.prompt_token_ids is None or len(
                 request.prompt_token_ids) == 0:
-            system = request.get("system")
+            # system = request.get("system")
             if request.prompt is None and request.messages is None:
                 raise ValueError(
                     f"The request should have `input_ids`, `text` or `messages`: {request}.")

@@ -149,7 +149,7 @@ def process_request_dict(self, request, max_model_len=None):
             request['stop_token_ids'] = stop_seqs
             request['stop_seqs_len'] = stop_seqs_len

-        system = request.get("system")
+        # system = request.get("system")
         # 处理prompt_token_ids
         if not request.get('prompt_token_ids'):
             if request.get('prompt') is None and request.get(

@@ -213,7 +213,7 @@ def process_response(self, response_dict, **kwargs):
             response_dict.outputs.reasoning_content = reasoning_content
         else:
             response_dict.outputs.text = full_text
-        data_processor_logger.info(f"req_id:{req_id}, token)ids: {token_ids}")
+        data_processor_logger.info(f"req_id:{req_id}, token ids: {token_ids}")
         if response_dict.outputs.text == "" and \
                 response_dict.outputs.reasoning_content == "":
             return None

fastdeploy/model_executor/layers/sample/sampler.py

Lines changed: 2 additions & 3 deletions

@@ -125,15 +125,14 @@ def apply_token_mask(self,

         indices = []
         for idx, processor in self.logits_processor.items():
-            if processor is None:
+            if processor is None or idx in skip_idx_list:
                 continue
             if not processor.enable_reasoning or processor.reasoning_ended:
                 indices.append(idx)

-        mask_idx = [i for i in indices if i not in skip_idx_list]
         return available_processors.apply_token_mask(logits,
                                                      self.token_bitmask,
-                                                     indices=mask_idx)
+                                                     indices=indices)

     def _accept_token(self, idx: int, token: int):
         """ accept token """

fastdeploy/worker/gpu_model_runner.py

Lines changed: 1 addition & 1 deletion

@@ -1024,7 +1024,7 @@ class at the server level, which is too granular for ModelRunner.
             if self.parallel_config.tensor_parallel_degree > 1:
                 paddle.distributed.broadcast(sampler_output.sampled_token_ids, 0)

-            self.sampler.post_process(sampled_token_ids, skip_idx_list)
+            self.sampler.post_process(sampler_output.sampled_token_ids, skip_idx_list)
         else:
             self.sampler(logits, self.sampling_metadata,
                          self.parallel_config.max_model_len, self.share_inputs)

fastdeploy/worker/worker_process.py

Lines changed: 3 additions & 0 deletions

@@ -606,7 +606,10 @@ def getattr_without_none(obj, attr_name, default=None):
     # Get model config from model directory
     model_config_dict, _ = ModelConfig.get_config_dict(config_or_args.model_name_or_path)

+<<<<<<< HEAD

+=======
+>>>>>>> 75f51506 (update code)
     # Handle MoE related configs
     if 'num_experts' in model_config_dict:
         model_config_dict['moe_num_experts'] = model_config_dict.pop('num_experts')
