diff --git a/fastdeploy/demo/offline_demo.py b/fastdeploy/demo/offline_demo.py
index 856757aa00..7e56799ee6 100644
--- a/fastdeploy/demo/offline_demo.py
+++ b/fastdeploy/demo/offline_demo.py
@@ -17,13 +17,14 @@
 from fastdeploy.engine.sampling_params import SamplingParams
 from fastdeploy.entrypoints.llm import LLM
 
-model_name_or_path = "./models/llama-7b"
+model_name_or_path = "/root/.paddlenlp/models/Qwen/Qwen3-8B"
+# model_name_or_path = "/home/zexuli/Models/Qwen3-0.6B"
 
-# 超参设置
-sampling_params = SamplingParams(temperature=0.1, max_tokens=30)
-llm = LLM(model=model_name_or_path, tensor_parallel_size=1)
-output = llm.generate(prompts="who are you?",
-                      use_tqdm=True,
-                      sampling_params=sampling_params)
-
-print(output)
+sampling_params = SamplingParams(temperature=0.1)
+llm = LLM(model=model_name_or_path, tensor_parallel_size=2,reasoning_parser="qwen3")
+prompt = "北京天安门在哪里?"
+messages = [{"role": "user", "content": prompt}]
+output = llm.chat([messages],
+                  sampling_params)
+
+print(output)
\ No newline at end of file
diff --git a/fastdeploy/envs.py b/fastdeploy/envs.py
index 8ef8a5149c..2410f5083c 100644
--- a/fastdeploy/envs.py
+++ b/fastdeploy/envs.py
@@ -72,7 +72,7 @@
     # Set attention backend. "NATIVE_ATTN", "APPEND_ATTN"
     # and "MLA_ATTN" can be set currently.
     "FD_ATTENTION_BACKEND":
-    lambda: os.getenv("FD_ATTENTION_BACKEND", "APPEND_ATTN"),
+    lambda: os.getenv("FD_ATTENTION_BACKEND", "APPEND_ATTN,NATIVE_ATTN").split(","),
 
     # Set sampling class. "base", "air" and "rejection" can be set currently.
     "FD_SAMPLING_CLASS":
diff --git a/fastdeploy/model_executor/layers/attention/attention_selecter.py b/fastdeploy/model_executor/layers/attention/attention_selecter.py
index 3db03b188e..29c183a155 100644
--- a/fastdeploy/model_executor/layers/attention/attention_selecter.py
+++ b/fastdeploy/model_executor/layers/attention/attention_selecter.py
@@ -34,7 +34,7 @@ def _get_attn_backend(selected_backend: str) -> object:
     selected_backend = backend_name_to_enum(selected_backend)
     attention_cls = current_platform.get_attention_backend_cls(
         selected_backend)
-
+    print("attention_cls",attention_cls)
     if not attention_cls:
         raise ValueError(
             f"Invalid attention backend for {current_platform.device_name}")
@@ -43,5 +43,8 @@
 
 def get_attention_backend() -> object:
     """Selects which attention backend."""
-    attention_backend = envs.FD_ATTENTION_BACKEND
-    return _get_attn_backend(attention_backend)
+    attention_backend,native_attention_backend = envs.FD_ATTENTION_BACKEND
+    if current_platform.is_cuda():
+        return _get_attn_backend(attention_backend)
+    elif current_platform.is_cpu():
+        return _get_attn_backend(native_attention_backend)
diff --git a/fastdeploy/model_executor/layers/embeddings.py b/fastdeploy/model_executor/layers/embeddings.py
index bc67cb1333..3a750eb5c4 100644
--- a/fastdeploy/model_executor/layers/embeddings.py
+++ b/fastdeploy/model_executor/layers/embeddings.py
@@ -122,6 +122,7 @@ def load_state_dict(self, state_dict: Dict[str,
         Args:
             state_dict (dict): A dictionary containing the checkpoint weights and biases.
         """
+
         if self.tie_word_embeddings:
             self.word_embeddings.weight.set_value(
                 get_tensor(state_dict[self.prefix + ".weight"]).astype(
@@ -131,6 +132,7 @@ def load_state_dict(self, state_dict: Dict[str,
                 get_tensor(state_dict.pop(self.prefix + ".weight")).astype(
                     paddle.get_default_dtype()))
 
+
     def forward(self, ids_remove_padding=None) -> paddle.Tensor:
         """
         Defines the forward computation of the layer.
diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
index 8d6ca79a1b..c8942f3ba2 100644
--- a/fastdeploy/worker/gpu_model_runner.py
+++ b/fastdeploy/worker/gpu_model_runner.py
@@ -207,7 +207,6 @@ def insert_prefill_inputs(self, req_dicts: List[Request]):
             self.share_inputs["input_ids"][idx:idx + 1, :length] = np.array(
                 request.prompt_token_ids)
 
-            # Use chunked prefill
             if self.parallel_config.enable_chunked_prefill:
                 request.set("chunk_idx", 1)
 
@@ -714,6 +713,7 @@ def initialize_attn_backend(self) -> None:
 
         # Get the attention backend
        attn_cls = get_attention_backend()
+
        attn_backend = attn_cls(self.fd_config,
                                kv_num_heads=self.model_config.kv_num_heads,
                                num_heads=num_heads,
diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py
index fef56089ba..c4cf13cab7 100644
--- a/fastdeploy/worker/worker_process.py
+++ b/fastdeploy/worker/worker_process.py
@@ -596,7 +596,7 @@ def initialize_fd_config(config_or_args) -> FDConfig:
     model_config.head_dim = model_config_dict["head_dim"]
     paddle.set_default_dtype(config_or_args.dtype)
     if 'tie_word_embeddings' in model_config_dict:
-        model_config_dict['tie_word_embeddings'] = model_config_dict.pop('tie_word_embeddings')
+        model_config.tie_word_embeddings = model_config_dict['tie_word_embeddings']
 
     # Initialize all config components
     device_config = DeviceConfig()
diff --git a/test/ci_use/Qwen3-MoE/test_Qwen3-MoE_serving.py b/test/ci_use/Qwen3-MoE/test_Qwen3-MoE_serving.py
index 092b1282f3..704884ee6c 100644
--- a/test/ci_use/Qwen3-MoE/test_Qwen3-MoE_serving.py
+++ b/test/ci_use/Qwen3-MoE/test_Qwen3-MoE_serving.py
@@ -23,7 +23,7 @@
 
 
 # Read ports from environment variables; use default values if not set
-FD_API_PORT = int(os.getenv("FD_API_PORT", 8188))
+FD_API_PORT = int(os.getenv("FD_API_PORT", 8781))
 FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8133))
 FD_METRICS_PORT = int(os.getenv("FD_METRICS_PORT", 8233))
 
diff --git a/test/layers/test_append_attention.py b/test/layers/test_append_attention.py
index 2b23566efb..abc05dbaeb 100644
--- a/test/layers/test_append_attention.py
+++ b/test/layers/test_append_attention.py
@@ -80,6 +80,7 @@ def _apply_rope(self, rotary_emb, q, k, v=None, causal=False):
         # sin [θ0,θ1,θ2......θd/2-1] -> sin_pos [θ0,θ0,θ1,θ1,θ2,θ2......θd/2-1,θd/2-1]
 
         if self.use_neox_rotary_style:
+            print("use_neox_rotary_style也有?")
             sin_pos = sin
             cos_pos = cos
             # NeoX Stype:前后半部分分块旋转
@@ -92,7 +93,7 @@ def _apply_rope(self, rotary_emb, q, k, v=None, causal=False):
                 paddle.shape(k),
             )
         else:
-            # import pdb;pdb.set_trace()
+            print("跑的这里嘛")
             sin_pos = paddle.reshape(paddle.stack(
                 [sin, sin], axis=-1), [1, 1, seq, head_dim])
             # cos [θ0,θ1,θ2......θd/2-1] -> cos_pos [θ0,θ0,θ1,θ1,θ2,θ2......θd/2-1,θd/2-1]