
Fixed the garbled text issues in Qwen3-8B #2782


Open · wants to merge 6 commits into base: develop
Changes from all commits
19 changes: 10 additions & 9 deletions fastdeploy/demo/offline_demo.py
@@ -17,13 +17,14 @@
from fastdeploy.engine.sampling_params import SamplingParams
from fastdeploy.entrypoints.llm import LLM

-model_name_or_path = "./models/llama-7b"
+model_name_or_path = "/root/.paddlenlp/models/Qwen/Qwen3-8B"
+# model_name_or_path = "/home/zexuli/Models/Qwen3-0.6B"

# 超参设置 (hyperparameter settings)
-sampling_params = SamplingParams(temperature=0.1, max_tokens=30)
-llm = LLM(model=model_name_or_path, tensor_parallel_size=1)
-output = llm.generate(prompts="who are you?",
-                      use_tqdm=True,
-                      sampling_params=sampling_params)
-
-print(output)
+sampling_params = SamplingParams(temperature=0.1)
+llm = LLM(model=model_name_or_path, tensor_parallel_size=2, reasoning_parser="qwen3")
+prompt = "北京天安门在哪里?"
+messages = [{"role": "user", "content": prompt}]
+output = llm.chat([messages],
+                  sampling_params)
+print(output)
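Read as a standalone script, the updated demo looks as follows; everything mirrors the added lines above, and only the English comments and comma spacing are editorial:

```python
from fastdeploy.engine.sampling_params import SamplingParams
from fastdeploy.entrypoints.llm import LLM

# Local Qwen3-8B checkpoint path used in the demo; point this at your own download.
model_name_or_path = "/root/.paddlenlp/models/Qwen/Qwen3-8B"

# Sampling hyperparameters
sampling_params = SamplingParams(temperature=0.1)

# Two-way tensor parallelism plus the "qwen3" reasoning parser, as in the diff.
llm = LLM(model=model_name_or_path, tensor_parallel_size=2, reasoning_parser="qwen3")

# Prompt: "Where is Tiananmen in Beijing?"
prompt = "北京天安门在哪里?"
messages = [{"role": "user", "content": prompt}]

# The diff wraps `messages` in a list, presumably a batch of one conversation.
output = llm.chat([messages], sampling_params)
print(output)
```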
2 changes: 1 addition & 1 deletion fastdeploy/envs.py
@@ -72,7 +72,7 @@
# Set attention backend. "NATIVE_ATTN", "APPEND_ATTN"
# and "MLA_ATTN" can be set currently.
"FD_ATTENTION_BACKEND":
-lambda: os.getenv("FD_ATTENTION_BACKEND", "APPEND_ATTN"),
+lambda: os.getenv("FD_ATTENTION_BACKEND", "APPEND_ATTN,NATIVE_ATTN").split(","),

# Set sampling class. "base", "air" and "rejection" can be set currently.
"FD_SAMPLING_CLASS":
@@ -34,7 +34,7 @@ def _get_attn_backend(selected_backend: str) -> object:
selected_backend = backend_name_to_enum(selected_backend)
attention_cls = current_platform.get_attention_backend_cls(
selected_backend)
-
+print("attention_cls", attention_cls)
if not attention_cls:
raise ValueError(
f"Invalid attention backend for {current_platform.device_name}")
@@ -43,5 +43,8 @@ def _get_attn_backend(selected_backend: str) -> object:

def get_attention_backend() -> object:
"""Selects which attention backend."""
-attention_backend = envs.FD_ATTENTION_BACKEND
-return _get_attn_backend(attention_backend)
+attention_backend, native_attention_backend = envs.FD_ATTENTION_BACKEND
+if current_platform.is_cuda():
+    return _get_attn_backend(attention_backend)
+elif current_platform.is_cpu():
+    return _get_attn_backend(native_attention_backend)
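Taken together with the envs.py change above, backend selection now works off a comma-separated pair. A minimal sketch of the intended resolution with stand-in helpers; the function names and platform flags below are illustrative, not FastDeploy API, and only the default string and the CUDA/CPU split come from the diff:

```python
import os

def read_fd_attention_backend() -> list[str]:
    # Mirrors the new envs.py entry: the env var now holds a comma-separated
    # list, defaulting to "APPEND_ATTN,NATIVE_ATTN".
    return os.getenv("FD_ATTENTION_BACKEND", "APPEND_ATTN,NATIVE_ATTN").split(",")

def select_backend(is_cuda: bool, is_cpu: bool) -> str | None:
    # First entry is used on CUDA, second on CPU, as in get_attention_backend().
    cuda_backend, native_backend = read_fd_attention_backend()
    if is_cuda:
        return cuda_backend
    elif is_cpu:
        return native_backend
    return None  # other platforms currently fall through without a backend

print(select_backend(is_cuda=True, is_cpu=False))   # APPEND_ATTN
print(select_backend(is_cuda=False, is_cpu=True))   # NATIVE_ATTN
```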
2 changes: 2 additions & 0 deletions fastdeploy/model_executor/layers/embeddings.py
@@ -122,6 +122,7 @@ def load_state_dict(self, state_dict: Dict[str,
Args:
state_dict (dict): A dictionary containing the checkpoint weights and biases.
"""
+
if self.tie_word_embeddings:
self.word_embeddings.weight.set_value(
get_tensor(state_dict[self.prefix + ".weight"]).astype(
@@ -131,6 +132,7 @@ def load_state_dict(self, state_dict: Dict[str,
get_tensor(state_dict.pop(self.prefix + ".weight")).astype(
paddle.get_default_dtype()))

+
def forward(self, ids_remove_padding=None) -> paddle.Tensor:
"""
Defines the forward computation of the layer.
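For orientation, the surrounding load_state_dict logic keys off tie_word_embeddings: the tied branch reads the weight without pop(), while the untied branch pops it. A toy reduction of that read-vs-pop distinction; the dict key here is made up, only the access pattern comes from the context lines:

```python
# Toy illustration of read-vs-pop; "embeddings.weight" is a made-up key.
state_dict = {"embeddings.weight": "W"}

tie_word_embeddings = True
if tie_word_embeddings:
    weight = state_dict["embeddings.weight"]      # entry is left in place
else:
    weight = state_dict.pop("embeddings.weight")  # entry is consumed

print(weight, "embeddings.weight" in state_dict)  # W True
```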
2 changes: 1 addition & 1 deletion fastdeploy/worker/gpu_model_runner.py
@@ -207,7 +207,6 @@ def insert_prefill_inputs(self, req_dicts: List[Request]):
self.share_inputs["input_ids"][idx:idx +
1, :length] = np.array(
request.prompt_token_ids)
-
# Use chunked prefill
if self.parallel_config.enable_chunked_prefill:
request.set("chunk_idx", 1)
@@ -714,6 +713,7 @@ def initialize_attn_backend(self) -> None:

# Get the attention backend
attn_cls = get_attention_backend()
+
attn_backend = attn_cls(self.fd_config,
kv_num_heads=self.model_config.kv_num_heads,
num_heads=num_heads,
2 changes: 1 addition & 1 deletion fastdeploy/worker/worker_process.py
@@ -596,7 +596,7 @@ def initialize_fd_config(config_or_args) -> FDConfig:
model_config.head_dim = model_config_dict["head_dim"]
paddle.set_default_dtype(config_or_args.dtype)
if 'tie_word_embeddings' in model_config_dict:
-model_config_dict['tie_word_embeddings'] = model_config_dict.pop('tie_word_embeddings')
+model_config.tie_word_embeddings = model_config_dict['tie_word_embeddings']

# Initialize all config components
device_config = DeviceConfig()
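The one-line change above matters because the old statement only rewrote the dict entry in place, so the tie_word_embeddings flag never reached the config object that the embedding layer later checks. A minimal reduction; the ModelConfig stand-in is an assumption, only the two assignment styles come from the diff:

```python
class ModelConfig:
    # Stand-in for FastDeploy's model config; the default value is assumed.
    tie_word_embeddings: bool = False

model_config = ModelConfig()
model_config_dict = {"tie_word_embeddings": True}

# Old line: pops the key and writes it straight back, so nothing changes.
model_config_dict["tie_word_embeddings"] = model_config_dict.pop("tie_word_embeddings")
print(model_config.tie_word_embeddings)  # False: the config never saw the flag

# New line: copies the flag onto the config object consumed downstream.
model_config.tie_word_embeddings = model_config_dict["tie_word_embeddings"]
print(model_config.tie_word_embeddings)  # True
```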
2 changes: 1 addition & 1 deletion test/ci_use/Qwen3-MoE/test_Qwen3-MoE_serving.py
@@ -23,7 +23,7 @@


# Read ports from environment variables; use default values if not set
-FD_API_PORT = int(os.getenv("FD_API_PORT", 8188))
+FD_API_PORT = int(os.getenv("FD_API_PORT", 8781))
FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8133))
FD_METRICS_PORT = int(os.getenv("FD_METRICS_PORT", 8233))

3 changes: 2 additions & 1 deletion test/layers/test_append_attention.py
@@ -80,6 +80,7 @@ def _apply_rope(self, rotary_emb, q, k, v=None, causal=False):
# sin [θ0,θ1,θ2......θd/2-1] -> sin_pos [θ0,θ0,θ1,θ1,θ2,θ2......θd/2-1,θd/2-1]

if self.use_neox_rotary_style:
print("use_neox_rotary_style也有?")
sin_pos = sin
cos_pos = cos
# NeoX Stype:前后半部分分块旋转 (NeoX style: the first and second halves are rotated block-wise)
@@ -92,7 +93,7 @@ def _apply_rope(self, rotary_emb, q, k, v=None, causal=False):
paddle.shape(k),
)
else:
-# import pdb;pdb.set_trace()
+print("跑的这里嘛")
sin_pos = paddle.reshape(paddle.stack(
[sin, sin], axis=-1), [1, 1, seq, head_dim])
# cos [θ0,θ1,θ2......θd/2-1] -> cos_pos [θ0,θ0,θ1,θ1,θ2,θ2......θd/2-1,θd/2-1]
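For readers comparing the two rotary layouts in this test, a standalone numpy sketch of the non-NeoX (interleaved) branch that gained the debug print; the sequence length, head size, and frequency base below are illustrative, only the stack-and-reshape pattern comes from the test:

```python
import numpy as np

seq, head_dim = 4, 8
# θ0..θ(d/2-1): per-pair rotary frequencies (base 10000 assumed for this sketch)
inv_freq = 1.0 / (10000 ** (np.arange(0, head_dim, 2) / head_dim))
pos = np.arange(seq)[:, None]
sin = np.sin(pos * inv_freq)                              # [seq, head_dim // 2]

# stack([sin, sin], axis=-1) -> [seq, head_dim // 2, 2]; the reshape flattens the
# last two axes, giving [θ0, θ0, θ1, θ1, ..., θd/2-1, θd/2-1] per position,
# matching the "# sin ... -> sin_pos ..." comment in the test.
sin_pos = np.stack([sin, sin], axis=-1).reshape(1, 1, seq, head_dim)

print(sin_pos.shape)                                           # (1, 1, 4, 8)
print(np.allclose(sin_pos[..., 0::2], sin_pos[..., 1::2]))     # True: each θ repeats
```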