
Fixed the garbled text issues in Qwen3-8B #2782


Open · wants to merge 6 commits into base: develop
Changes from all commits
19 changes: 10 additions & 9 deletions fastdeploy/demo/offline_demo.py
@@ -17,13 +17,14 @@
from fastdeploy.engine.sampling_params import SamplingParams
from fastdeploy.entrypoints.llm import LLM

-model_name_or_path = "./models/llama-7b"
+model_name_or_path = "/root/.paddlenlp/models/Qwen/Qwen3-8B"
+# model_name_or_path = "/home/zexuli/Models/Qwen3-0.6B"

# 超参设置 (hyperparameter settings)
-sampling_params = SamplingParams(temperature=0.1, max_tokens=30)
-llm = LLM(model=model_name_or_path, tensor_parallel_size=1)
-output = llm.generate(prompts="who are you?",
-                      use_tqdm=True,
-                      sampling_params=sampling_params)
-
-print(output)
+sampling_params = SamplingParams(temperature=0.1)
+llm = LLM(model=model_name_or_path, tensor_parallel_size=2, reasoning_parser="qwen3")
+prompt = "北京天安门在哪里?"
+messages = [{"role": "user", "content": prompt}]
+output = llm.chat([messages],
+                  sampling_params)
+print(output)
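Read as a standalone script, the updated demo looks as follows; everything mirrors the added lines above, and only the English comments and comma spacing are editorial:

```python
from fastdeploy.engine.sampling_params import SamplingParams
from fastdeploy.entrypoints.llm import LLM

# Local Qwen3-8B checkpoint path used in the demo; point this at your own download.
model_name_or_path = "/root/.paddlenlp/models/Qwen/Qwen3-8B"

# Sampling hyperparameters
sampling_params = SamplingParams(temperature=0.1)

# Two-way tensor parallelism plus the "qwen3" reasoning parser, as in the diff.
llm = LLM(model=model_name_or_path, tensor_parallel_size=2, reasoning_parser="qwen3")

# Prompt: "Where is Tiananmen in Beijing?"
prompt = "北京天安门在哪里?"
messages = [{"role": "user", "content": prompt}]

# The diff wraps `messages` in a list, presumably a batch of one conversation.
output = llm.chat([messages], sampling_params)
print(output)
```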
2 changes: 1 addition & 1 deletion fastdeploy/envs.py
@@ -72,7 +72,7 @@
# Set attention backend. "NATIVE_ATTN", "APPEND_ATTN"
# and "MLA_ATTN" can be set currently.
"FD_ATTENTION_BACKEND":
-lambda: os.getenv("FD_ATTENTION_BACKEND", "APPEND_ATTN"),
+lambda: os.getenv("FD_ATTENTION_BACKEND", "APPEND_ATTN,NATIVE_ATTN").split(","),

# Set sampling class. "base", "air" and "rejection" can be set currently.
"FD_SAMPLING_CLASS":
@@ -34,7 +34,7 @@ def _get_attn_backend(selected_backend: str) -> object:
selected_backend = backend_name_to_enum(selected_backend)
attention_cls = current_platform.get_attention_backend_cls(
selected_backend)
-
+print("attention_cls", attention_cls)
if not attention_cls:
raise ValueError(
f"Invalid attention backend for {current_platform.device_name}")
@@ -43,5 +43,8 @@ def _get_attn_backend(selected_backend: str) -> object:

def get_attention_backend() -> object:
"""Selects which attention backend."""
-attention_backend = envs.FD_ATTENTION_BACKEND
-return _get_attn_backend(attention_backend)
+attention_backend, native_attention_backend = envs.FD_ATTENTION_BACKEND
+if current_platform.is_cuda():
+    return _get_attn_backend(attention_backend)
+elif current_platform.is_cpu():
+    return _get_attn_backend(native_attention_backend)
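Taken together with the envs.py change above, backend selection now works off a comma-separated pair. A minimal sketch of the intended resolution with stand-in helpers; the function names and platform flags below are illustrative, not FastDeploy API, and only the default string and the CUDA/CPU split come from the diff:

```python
import os

def read_fd_attention_backend() -> list[str]:
    # Mirrors the new envs.py entry: the env var now holds a comma-separated
    # list, defaulting to "APPEND_ATTN,NATIVE_ATTN".
    return os.getenv("FD_ATTENTION_BACKEND", "APPEND_ATTN,NATIVE_ATTN").split(",")

def select_backend(is_cuda: bool, is_cpu: bool) -> str | None:
    # First entry is used on CUDA, second on CPU, as in get_attention_backend().
    cuda_backend, native_backend = read_fd_attention_backend()
    if is_cuda:
        return cuda_backend
    elif is_cpu:
        return native_backend
    return None  # other platforms currently fall through without a backend

print(select_backend(is_cuda=True, is_cpu=False))   # APPEND_ATTN
print(select_backend(is_cuda=False, is_cpu=True))   # NATIVE_ATTN
```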
2 changes: 2 additions & 0 deletions fastdeploy/model_executor/layers/embeddings.py
@@ -122,6 +122,7 @@ def load_state_dict(self, state_dict: Dict[str,
Args:
state_dict (dict): A dictionary containing the checkpoint weights and biases.
"""
+
if self.tie_word_embeddings:
self.word_embeddings.weight.set_value(
get_tensor(state_dict[self.prefix + ".weight"]).astype(
@@ -131,6 +132,7 @@ def load_state_dict(self, state_dict: Dict[str,
get_tensor(state_dict.pop(self.prefix + ".weight")).astype(
paddle.get_default_dtype()))

+
def forward(self, ids_remove_padding=None) -> paddle.Tensor:
"""
Defines the forward computation of the layer.
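For orientation, the surrounding load_state_dict logic keys off tie_word_embeddings: the tied branch reads the weight without pop(), while the untied branch pops it. A toy reduction of that read-vs-pop distinction; the dict key here is made up, only the access pattern comes from the context lines:

```python
# Toy illustration of read-vs-pop; "embeddings.weight" is a made-up key.
state_dict = {"embeddings.weight": "W"}

tie_word_embeddings = True
if tie_word_embeddings:
    weight = state_dict["embeddings.weight"]      # entry is left in place
else:
    weight = state_dict.pop("embeddings.weight")  # entry is consumed

print(weight, "embeddings.weight" in state_dict)  # W True
```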
2 changes: 1 addition & 1 deletion fastdeploy/worker/gpu_model_runner.py
@@ -207,7 +207,6 @@ def insert_prefill_inputs(self, req_dicts: List[Request]):
self.share_inputs["input_ids"][idx:idx +
1, :length] = np.array(
request.prompt_token_ids)
-
# Use chunked prefill
if self.parallel_config.enable_chunked_prefill:
request.set("chunk_idx", 1)
@@ -714,6 +713,7 @@ def initialize_attn_backend(self) -> None:

# Get the attention backend
attn_cls = get_attention_backend()
+
attn_backend = attn_cls(self.fd_config,
kv_num_heads=self.model_config.kv_num_heads,
num_heads=num_heads,
2 changes: 1 addition & 1 deletion fastdeploy/worker/worker_process.py
@@ -596,7 +596,7 @@ def initialize_fd_config(config_or_args) -> FDConfig:
model_config.head_dim = model_config_dict["head_dim"]
paddle.set_default_dtype(config_or_args.dtype)
if 'tie_word_embeddings' in model_config_dict:
-model_config_dict['tie_word_embeddings'] = model_config_dict.pop('tie_word_embeddings')
+model_config.tie_word_embeddings = model_config_dict['tie_word_embeddings']

# Initialize all config components
device_config = DeviceConfig()
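The one-line change above matters because the old statement only rewrote the dict entry in place, so the tie_word_embeddings flag never reached the config object that the embedding layer later checks. A minimal reduction; the ModelConfig stand-in is an assumption, only the two assignment styles come from the diff:

```python
class ModelConfig:
    # Stand-in for FastDeploy's model config; the default value is assumed.
    tie_word_embeddings: bool = False

model_config = ModelConfig()
model_config_dict = {"tie_word_embeddings": True}

# Old line: pops the key and writes it straight back, so nothing changes.
model_config_dict["tie_word_embeddings"] = model_config_dict.pop("tie_word_embeddings")
print(model_config.tie_word_embeddings)  # False: the config never saw the flag

# New line: copies the flag onto the config object consumed downstream.
model_config.tie_word_embeddings = model_config_dict["tie_word_embeddings"]
print(model_config.tie_word_embeddings)  # True
```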
2 changes: 1 addition & 1 deletion test/ci_use/Qwen3-MoE/test_Qwen3-MoE_serving.py
@@ -23,7 +23,7 @@


# Read ports from environment variables; use default values if not set
-FD_API_PORT = int(os.getenv("FD_API_PORT", 8188))
+FD_API_PORT = int(os.getenv("FD_API_PORT", 8781))
FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8133))
FD_METRICS_PORT = int(os.getenv("FD_METRICS_PORT", 8233))

3 changes: 2 additions & 1 deletion test/layers/test_append_attention.py
@@ -80,6 +80,7 @@ def _apply_rope(self, rotary_emb, q, k, v=None, causal=False):
# sin [θ0,θ1,θ2......θd/2-1] -> sin_pos [θ0,θ0,θ1,θ1,θ2,θ2......θd/2-1,θd/2-1]

if self.use_neox_rotary_style:
print("use_neox_rotary_style也有?")
sin_pos = sin
cos_pos = cos
# NeoX Stype:前后半部分分块旋转 (NeoX style: the first and second halves are rotated block-wise)
@@ -92,7 +93,7 @@ def _apply_rope(self, rotary_emb, q, k, v=None, causal=False):
paddle.shape(k),
)
else:
-# import pdb;pdb.set_trace()
+print("跑的这里嘛")
sin_pos = paddle.reshape(paddle.stack(
[sin, sin], axis=-1), [1, 1, seq, head_dim])
# cos [θ0,θ1,θ2......θd/2-1] -> cos_pos [θ0,θ0,θ1,θ1,θ2,θ2......θd/2-1,θd/2-1]
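For readers comparing the two rotary layouts in this test, a standalone numpy sketch of the non-NeoX (interleaved) branch that gained the debug print; the sequence length, head size, and frequency base below are illustrative, only the stack-and-reshape pattern comes from the test:

```python
import numpy as np

seq, head_dim = 4, 8
# θ0..θ(d/2-1): per-pair rotary frequencies (base 10000 assumed for this sketch)
inv_freq = 1.0 / (10000 ** (np.arange(0, head_dim, 2) / head_dim))
pos = np.arange(seq)[:, None]
sin = np.sin(pos * inv_freq)                              # [seq, head_dim // 2]

# stack([sin, sin], axis=-1) -> [seq, head_dim // 2, 2]; the reshape flattens the
# last two axes, giving [θ0, θ0, θ1, θ1, ..., θd/2-1, θd/2-1] per position,
# matching the "# sin ... -> sin_pos ..." comment in the test.
sin_pos = np.stack([sin, sin], axis=-1).reshape(1, 1, seq, head_dim)

print(sin_pos.shape)                                           # (1, 1, 4, 8)
print(np.allclose(sin_pos[..., 0::2], sin_pos[..., 1::2]))     # True: each θ repeats
```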