Simplify the Config code #2770

Merged · 25 commits · Jul 14, 2025
Changes from 14 commits
525 changes: 259 additions & 266 deletions fastdeploy/config.py

Large diffs are not rendered by default.
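The config.py diff is collapsed here, so as a reading aid, below is a minimal sketch of the attribute layout the updated call sites on the rest of this page assume. It is inferred from the hunks below, not copied from fastdeploy/config.py, and the defaults are illustrative only:

```python
# Sketch only: inferred from the call-site changes in this PR, not the real fastdeploy/config.py.
from dataclasses import dataclass, field
from typing import Optional

@dataclass
class ModelConfig:
    # HF-style name replaces the old `num_layers`.
    num_hidden_layers: int = 32
    num_attention_heads: int = 32
    num_key_value_heads: int = 8
    head_dim: int = 128
    hidden_size: int = 4096
    # DeepSeek-V3 fields are now read directly from model_config
    # (previously model_config.deepseekv3.*).
    kv_lora_rank: Optional[int] = None
    qk_nope_head_dim: Optional[int] = None
    qk_rope_head_dim: Optional[int] = None
    rope_scaling: Optional[dict] = None
    # MoE knobs folded in from the removed standalone moe_config.
    moe_use_aux_free: bool = False
    num_max_dispatch_tokens_per_rank: int = 256

@dataclass
class ParallelConfig:
    tensor_parallel_size: int = 1   # was tensor_parallel_degree
    expert_parallel_size: int = 1   # was expert_parallel_degree
    tensor_parallel_rank: int = 0
    expert_parallel_rank: int = 0

@dataclass
class FDConfig:
    model_config: ModelConfig = field(default_factory=ModelConfig)
    parallel_config: ParallelConfig = field(default_factory=ParallelConfig)
```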

(file header not rendered)

@@ -95,7 +95,7 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
self.kv_num_heads: int = kv_num_heads
self.num_heads: int = num_heads
self.head_dim: int = fd_config.model_config.head_dim
- self.num_layers: int = fd_config.model_config.num_layers
+ self.num_layers: int = fd_config.model_config.num_hidden_layers
self.max_partition_size: int = int(
os.getenv("FLAGS_max_partition_size", 32768))

4 changes: 2 additions & 2 deletions fastdeploy/model_executor/layers/attention/attention.py

@@ -67,10 +67,10 @@ def __init__(
ValueError: If the `v_head_dim` is less than 0.
"""
super().__init__()
- self.num_heads: int = fd_config.model_config.num_attention_heads // fd_config.parallel_config.tensor_parallel_degree
+ self.num_heads: int = fd_config.model_config.num_attention_heads // fd_config.parallel_config.tensor_parallel_size
self.head_dim: int = fd_config.model_config.head_dim
self.kv_num_heads: int = \
- max(1, fd_config.model_config.num_key_value_heads // fd_config.parallel_config.tensor_parallel_degree)
+ max(1, fd_config.model_config.num_key_value_heads // fd_config.parallel_config.tensor_parallel_size)
self.layer_id: int = layer_id
self.v_head_dim: int = v_head_dim if v_head_dim > 0 else self.head_dim
self.rope_type: str = rope_type
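Only the attribute name changes here (tensor_parallel_degree → tensor_parallel_size); the head partitioning itself is untouched. A quick worked example of the arithmetic above, with illustrative head counts:

```python
# Illustrative numbers, not taken from any particular model config.
num_attention_heads = 32
num_key_value_heads = 4
tensor_parallel_size = 8

num_heads = num_attention_heads // tensor_parallel_size             # 4 query heads per rank
kv_num_heads = max(1, num_key_value_heads // tensor_parallel_size)  # clamped to 1, so KV heads are replicated
```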
(file header not rendered)

@@ -96,7 +96,7 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
self.head_dim = fd_config.model_config.head_dim
self.hidden_size = fd_config.model_config.hidden_size
self.block_size = fd_config.parallel_config.block_size
- self.num_layers: int = fd_config.model_config.num_layers
+ self.num_layers: int = fd_config.model_config.num_hidden_layers

self.speculative_method = fd_config.speculative_config.method
self.use_speculate = self.speculative_method is not None
(file header not rendered)

@@ -102,7 +102,7 @@ def __init__(self, llm_config: FDConfig, kv_num_heads: int, num_heads: int,
self.head_dim = head_dim
# note: scale need to change if using MLA
self.attention_metadata.scale = 1.0 / sqrt(head_dim)
- self.num_layers = llm_config.model_config.num_layers
+ self.num_layers = llm_config.model_config.num_hidden_layers
self.record_block_table_metadata = {}
self.only_use_flash_attn = int(
os.getenv("FD_ILUVATAR_ONLY_USE_FLASH_ATTN", 0)) == 1
(file header not rendered)

@@ -113,18 +113,18 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
self.kv_num_heads: int = kv_num_heads
self.num_heads: int = num_heads
self.head_dim: int = fd_config.model_config.head_dim
- self.num_layers: int = fd_config.model_config.num_layers
+ self.num_layers: int = fd_config.model_config.num_hidden_layers

# For Multi Head Latent Attention
- self.kv_lora_rank: int = fd_config.model_config.deepseekv3.kv_lora_rank
- self.qk_rope_head_dim: int = fd_config.model_config.deepseekv3.qk_rope_head_dim
- self.qk_head_dim: int = fd_config.model_config.deepseekv3.qk_nope_head_dim \
- + fd_config.model_config.deepseekv3.qk_rope_head_dim
+ self.kv_lora_rank: int = fd_config.model_config.kv_lora_rank
+ self.qk_rope_head_dim: int = fd_config.model_config.qk_rope_head_dim
+ self.qk_head_dim: int = fd_config.model_config.qk_nope_head_dim \
+ + fd_config.model_config.qk_rope_head_dim
self.attn_softmax_scale: float = self.qk_head_dim**-0.5
- if fd_config.model_config.deepseekv3.rope_scaling:
- mscale_all_dim = fd_config.model_config.deepseekv3.rope_scaling.get(
+ if fd_config.model_config.rope_scaling:
+ mscale_all_dim = fd_config.model_config.rope_scaling.get(
"mscale_all_dim", False) # 1.0
- scaling_factor = fd_config.model_config.deepseekv3.rope_scaling[
+ scaling_factor = fd_config.model_config.rope_scaling[
"factor"] # 40
mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
self.attn_softmax_scale = self.attn_softmax_scale * mscale * mscale
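For readers unfamiliar with the scaling applied above: yarn_get_mscale is imported elsewhere in FastDeploy, but the conventional YaRN definition used by DeepSeek-style MLA looks like the sketch below; the hunk itself only changes where rope_scaling is read from (model_config instead of model_config.deepseekv3). The numbers are illustrative:

```python
import math

# Conventional YaRN mscale (sketch; the real helper is imported, not defined here).
def yarn_get_mscale(scale: float = 1.0, mscale: float = 1.0) -> float:
    if scale <= 1:
        return 1.0
    return 0.1 * mscale * math.log(scale) + 1.0

qk_head_dim = 128 + 64                              # qk_nope_head_dim + qk_rope_head_dim
mscale = yarn_get_mscale(40.0, 1.0)                 # ~1.37 for factor=40, mscale_all_dim=1.0
attn_softmax_scale = qk_head_dim ** -0.5 * mscale * mscale   # ~0.135
```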
2 changes: 1 addition & 1 deletion fastdeploy/model_executor/layers/attention/utils.py

@@ -22,7 +22,7 @@ def init_rank_and_device_id(fd_config: FDConfig):
"""
rank = (fd_config.parallel_config.expert_parallel_rank *
- fd_config.parallel_config.tensor_parallel_degree + fd_config.parallel_config.tensor_parallel_rank)
+ fd_config.parallel_config.tensor_parallel_size + fd_config.parallel_config.tensor_parallel_rank)

cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES", None)
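With the rename applied, the arithmetic is unchanged: for example, expert_parallel_rank = 1, tensor_parallel_size = 8 and tensor_parallel_rank = 3 still yield rank = 1 * 8 + 3 = 11.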

(file header not rendered)

@@ -95,7 +95,7 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
self.kv_num_heads: int = kv_num_heads
self.num_heads: int = num_heads
self.head_dim: int = head_dim
- self.num_layers: int = fd_config.model_config.num_layers
+ self.num_layers: int = fd_config.model_config.num_hidden_layers

# pd_disaggregation
self.use_pd_disaggregation: int = int(
(file header not rendered)

@@ -88,7 +88,7 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
self.num_heads = num_heads
self.head_dim = head_dim
self.scaling = 1.0 / (self.head_dim**0.5)
- self.num_layers = fd_config.model_config.num_layers
+ self.num_layers = fd_config.model_config.num_hidden_layers
self.position_ids_base = paddle.arange(self.max_seq_len)

# TODO(zhengjun): Need to adapt the allocation logic and
(file header not rendered)

@@ -88,7 +88,7 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
self.num_heads = num_heads
self.head_dim = head_dim
self.scaling = 1.0 / (self.head_dim**0.5)
- self.num_layers = fd_config.model_config.num_layers
+ self.num_layers = fd_config.model_config.num_hidden_layers
self.position_ids_base = paddle.arange(self.max_seq_len)

# TODO(zhengjun): Need to adapt the allocation logic and
10 changes: 0 additions & 10 deletions fastdeploy/model_executor/layers/embeddings.py

@@ -59,13 +59,11 @@ def __init__(
self.world_size: int = hcg.get_model_parallel_world_size()
self.ring_id: int = hcg.get_model_parallel_group().id
self.use_rope: bool = fd_config.model_config.use_rope
- self.rope_head_dim: int = fd_config.model_config.rope_head_dim
self.use_ep: bool = fd_config.parallel_config.use_ep
self.hidden_dropout_prob: float = fd_config.model_config.hidden_dropout_prob
self.initializer_range: float = fd_config.model_config.initializer_range
self.sequence_parallel: bool = fd_config.parallel_config.sequence_parallel
self.max_position_embeddings: int = fd_config.model_config.max_position_embeddings
- self.freeze_embedding: bool = fd_config.model_config.freeze_embedding
self.tie_word_embeddings: bool = fd_config.model_config.tie_word_embeddings
self.params_dtype: str = params_dtype

@@ -104,15 +102,7 @@ def __init__(
)

self.prefix = prefix

- if self.freeze_embedding:
- self.word_embeddings.weight.learning_rate = 0.0
- if not self.use_rope:
- self.position_embeddings.weight.learning_rate = 0.0

self.dropout = nn.Dropout(self.hidden_dropout_prob)
- self.rope_head_dim_shape_tensor = paddle.ones((self.rope_head_dim),
- dtype="int8")

def load_state_dict(self, state_dict: Dict[str,
paddle.Tensor | np.ndarray]):
10 changes: 5 additions & 5 deletions fastdeploy/model_executor/layers/linear.py

@@ -266,7 +266,7 @@ def __init__(
with_bias=with_bias,
add_bias=add_bias,
skip_quant=skip_quant)
- self.nranks = fd_config.parallel_config.tensor_parallel_degree
+ self.nranks = fd_config.parallel_config.tensor_parallel_size
self.input_size = input_size
self.output_size = divide(
output_size,
@@ -348,7 +348,7 @@ def __init__(
"""
self.activation = activation
self.hidden_size = fd_config.model_config.hidden_size
- self.nranks = fd_config.parallel_config.tensor_parallel_degree
+ self.nranks = fd_config.parallel_config.tensor_parallel_size

super().__init__(fd_config=fd_config,
prefix=prefix,
@@ -410,7 +410,7 @@ def __init__(self, fd_config, prefix, with_bias=False, add_bias=True):
self.kv_num_heads = fd_config.model_config.num_key_value_heads
self.hidden_size = fd_config.model_config.hidden_size
self.head_dim = fd_config.model_config.head_dim
- self.nranks = fd_config.parallel_config.tensor_parallel_degree
+ self.nranks = fd_config.parallel_config.tensor_parallel_size
self.num_heads_per_rank = divide(self.num_heads, self.nranks)
if self.kv_num_heads < self.nranks and self.nranks % self.kv_num_heads == 0:
self.kv_num_heads_per_rank = 1
@@ -538,7 +538,7 @@ def __init__(
skip_quant=skip_quant)
self.fd_config = fd_config
self.skip_quant = False
- self.nranks = fd_config.parallel_config.tensor_parallel_degree
+ self.nranks = fd_config.parallel_config.tensor_parallel_size
self.hidden_size = fd_config.model_config.hidden_size
self.head_dim = fd_config.model_config.head_dim
self.num_heads = fd_config.model_config.num_attention_heads // self.nranks
@@ -631,7 +631,7 @@ def __init__(
with_bias (bool): Whether to include bias or not. Defaults to False.
skip_quant (bool): Whether to skip quantization. Defaults to False.
"""
- self.nranks = fd_config.parallel_config.tensor_parallel_degree
+ self.nranks = fd_config.parallel_config.tensor_parallel_size
self.kv_lora_rank = kv_lora_rank
self.num_attention_heads = num_attention_heads
self.qk_nope_head_dim = qk_nope_head_dim
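As in the attention layer, only the nranks source changes in these linear layers; the per-rank math is the same. A worked example of the QKV head counts computed above (head counts are illustrative, and the final output-size line is the typical fused-QKV sizing rather than a line quoted from this file):

```python
num_heads, kv_num_heads, head_dim, nranks = 64, 8, 128, 16

num_heads_per_rank = num_heads // nranks            # 4
if kv_num_heads < nranks and nranks % kv_num_heads == 0:
    kv_num_heads_per_rank = 1                       # each KV head is shared by nranks // kv_num_heads ranks
else:
    kv_num_heads_per_rank = kv_num_heads // nranks

# Typical fused QKV projection width per rank (assumption, not shown in this diff):
qkv_output_size = (num_heads_per_rank + 2 * kv_num_heads_per_rank) * head_dim   # 768
```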
(file header not rendered)

@@ -49,7 +49,7 @@ def init_ep(self, layer: nn.Layer) -> None:
from .ep import EPDecoderRunner
self.ep_decoder_runner = EPDecoderRunner(
layer.top_k, layer.hidden_size, layer.num_experts,
- layer.moe_config.num_max_dispatch_tokens_per_rank,
+ layer.model_config.num_max_dispatch_tokens_per_rank,
layer.ep_size, layer.ep_rank)
else:
from .ep import EPPrefillRunner
(file header not rendered)

@@ -14,7 +14,6 @@
# limitations under the License.
"""

- import numpy as np
import paddle
from paddle import nn
from paddleformers.utils.log import logger
@@ -23,8 +22,8 @@
import fastdeploy.model_executor.ops.gpu.deep_gemm as deep_gemm
from fastdeploy.distributed.communication_op import \
tensor_model_parallel_all_reduce
- from fastdeploy.model_executor.ops.gpu import count_tokens_per_expert_func
from fastdeploy.model_executor.layers.utils import get_tensor
+ from fastdeploy.model_executor.ops.gpu import count_tokens_per_expert_func

from ..utils import create_and_set_parameter
from .fused_moe_backend_base import MoEMethodBase
@@ -242,7 +241,7 @@ def apply_ep_decode(
[
layer.num_local_experts,
layer.ep_size *
- layer.moe_config.num_max_dispatch_tokens_per_rank,
+ layer.model_config.num_max_dispatch_tokens_per_rank,
layer.moe_intermediate_size * 2,
],
dtype=paddle.bfloat16,
@@ -252,7 +251,7 @@
[
layer.num_local_experts,
layer.ep_size *
- layer.moe_config.num_max_dispatch_tokens_per_rank,
+ layer.model_config.num_max_dispatch_tokens_per_rank,
layer.hidden_size,
],
dtype=paddle.bfloat16,
7 changes: 3 additions & 4 deletions fastdeploy/model_executor/layers/moe/moe.py

@@ -72,16 +72,15 @@ def __init__(
self.layer_idx = layer_idx
self.reduce_results = reduce_results

- self.tp_size = fd_config.parallel_config.tensor_parallel_degree
- self.ep_size = fd_config.parallel_config.expert_parallel_degree
+ self.tp_size = fd_config.parallel_config.tensor_parallel_size
+ self.ep_size = fd_config.parallel_config.expert_parallel_size
self.ep_rank = fd_config.parallel_config.expert_parallel_rank

assert (self.tp_size >= 1 and self.ep_size == 1) or \
(self.tp_size == 1 and self.ep_size > 1), \
'MoE only support parallelism on TP or EP dimension.'

self.hidden_size = fd_config.model_config.hidden_size
- self.moe_config = fd_config.moe_config
self.num_experts = num_experts
self.num_local_experts = self.num_experts // self.ep_size
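For instance, with expert_parallel_size = 8 and num_experts = 64, the block above gives num_local_experts = 8 per EP rank, and the assertion then requires tensor_parallel_size to stay at 1.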

@@ -141,7 +140,7 @@ def init_moe_weights(self):
shape=gate_weight_shape,
dtype="float32",
)
- if self.moe_config.moe_use_aux_free:
+ if self.model_config.moe_use_aux_free:
self.gate_correction_bias = self.create_parameter(
shape=gate_correction_bias_shape,
dtype="float32",
4 changes: 2 additions & 2 deletions fastdeploy/model_executor/load_weight_utils.py

@@ -43,7 +43,7 @@ def load_ep_checkpoint(model_path: str,
filtered_map = {k: v for k, v in weight_list.items() if "experts" not in k}
num_local_ffn_keys = []

- for i in range(config.moe_layer_start_index, config.num_layers):
+ for i in range(config.moe_layer_start_index, config.num_hidden_layers):
for j in range(
config.num_experts_start_offset,
config.num_experts_start_offset + config.num_experts_per_rank,
@@ -260,7 +260,7 @@ def load_composite_checkpoint(
and os.path.isdir(os.path.join(model_path, f))
]
if len(rank_dirs) > 1:
- if fd_config.parallel_config.tensor_parallel_degree != len(
+ if fd_config.parallel_config.tensor_parallel_size != len(
rank_dirs):
raise ValueError(
f"Your model only supports loading with tp{len(rank_dirs)}"
42 changes: 21 additions & 21 deletions fastdeploy/model_executor/models/deepseek_v3.py

@@ -109,7 +109,7 @@ def __init__(self, fd_config: FDConfig, layer_id: int,
prefix: str) -> None:
super().__init__()

- self.tp_size = fd_config.parallel_config.tensor_parallel_degree
+ self.tp_size = fd_config.parallel_config.tensor_parallel_size

weight_key_map = {
"gate_weight_key": f"{prefix}.gate.weight",
@@ -124,23 +124,23 @@
self.fused_moe = FusedMoE(
fd_config=fd_config,
reduce_results=False,
- moe_intermediate_size=fd_config.model_config.deepseekv3.
+ moe_intermediate_size=fd_config.model_config.
moe_intermediate_size,
- num_experts=fd_config.model_config.deepseekv3.n_routed_experts,
- top_k=fd_config.model_config.deepseekv3.num_experts_per_tok,
- topk_method=fd_config.model_config.deepseekv3.topk_method,
- topk_group=fd_config.model_config.deepseekv3.topk_group,
- n_group=fd_config.model_config.deepseekv3.n_group,
- routed_scaling_factor=fd_config.model_config.deepseekv3.
+ num_experts=fd_config.model_config.n_routed_experts,
+ top_k=fd_config.model_config.num_experts_per_tok,
+ topk_method=fd_config.model_config.topk_method,
+ topk_group=fd_config.model_config.topk_group,
+ n_group=fd_config.model_config.n_group,
+ routed_scaling_factor=fd_config.model_config.
routed_scaling_factor,
layer_idx=layer_id,
weight_key_map=weight_key_map,
)

- self.num_shared_experts = fd_config.model_config.deepseekv3.n_shared_experts
+ self.num_shared_experts = fd_config.model_config.n_shared_experts
shared_experts_intermediate_size = (
self.num_shared_experts *
- fd_config.model_config.deepseekv3.moe_intermediate_size)
+ fd_config.model_config.moe_intermediate_size)

self.shared_experts = DeepSeekV3MLP(
fd_config=fd_config,
@@ -178,18 +178,18 @@ def __init__(self,
prefix: str = "") -> None:
super().__init__()

- self.tp_size = fd_config.parallel_config.tensor_parallel_degree
+ self.tp_size = fd_config.parallel_config.tensor_parallel_size
self.hidden_size = fd_config.model_config.hidden_size
self.num_attention_heads = fd_config.model_config.num_attention_heads
self.num_attention_heads_tp = self.num_attention_heads // self.tp_size

# MLA
- self.qk_nope_head_dim = fd_config.model_config.deepseekv3.qk_nope_head_dim
- self.qk_rope_head_dim = fd_config.model_config.deepseekv3.qk_rope_head_dim
+ self.qk_nope_head_dim = fd_config.model_config.qk_nope_head_dim
+ self.qk_rope_head_dim = fd_config.model_config.qk_rope_head_dim
self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim
- self.v_head_dim = fd_config.model_config.deepseekv3.v_head_dim
- self.q_lora_rank = fd_config.model_config.deepseekv3.q_lora_rank
- self.kv_lora_rank = fd_config.model_config.deepseekv3.kv_lora_rank
+ self.v_head_dim = fd_config.model_config.v_head_dim
+ self.q_lora_rank = fd_config.model_config.q_lora_rank
+ self.kv_lora_rank = fd_config.model_config.kv_lora_rank

self.attn_softmax_scale = self.qk_head_dim**-0.5
self.rope_theta = fd_config.model_config.rope_theta
@@ -255,7 +255,7 @@ def __init__(self,
qk_nope_head_dim=self.qk_nope_head_dim,
v_head_dim=self.v_head_dim)

- self.rope_scaling = fd_config.model_config.deepseekv3.rope_scaling
+ self.rope_scaling = fd_config.model_config.rope_scaling
if self.rope_scaling:
mscale_all_dim = self.rope_scaling.get("mscale_all_dim", False)
scaling_factor = self.rope_scaling["factor"]
@@ -449,9 +449,9 @@ def __init__(
prefix=f"{prefix}.self_attn",
)

- if (fd_config.model_config.deepseekv3.n_routed_experts is not None
+ if (fd_config.model_config.n_routed_experts is not None
and layer_id
- >= fd_config.model_config.deepseekv3.first_k_dense_replace):
+ >= fd_config.model_config.first_k_dense_replace):
self.mlp = DeepSeekV3MoE(
fd_config=fd_config,
layer_id=layer_id,
@@ -525,7 +525,7 @@ def __init__(
Initializer for the DeepSeekV3Model class.
"""
super().__init__()
- self.num_layers = fd_config.model_config.num_layers
+ self.num_layers = fd_config.model_config.num_hidden_layers
fd_config.model_config.prefix_name = "deepseek_v3"

self.embeddings = VocabParallelEmbedding(
@@ -755,5 +755,5 @@ def get_tensor_parallel_split_mappings(num_layers):

return final_actions

- mappings = get_tensor_parallel_split_mappings(config.num_layers)
+ mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers)
return mappings
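For anyone migrating other call sites, the renames visible in the hunks on this page reduce to the mapping below (a summary sketch; it may not cover parts of the PR that are not rendered here, such as the collapsed config.py diff):

```python
# Old attribute -> new attribute, as seen in the hunks above (possibly not exhaustive).
CONFIG_RENAMES = {
    "model_config.num_layers": "model_config.num_hidden_layers",
    "parallel_config.tensor_parallel_degree": "parallel_config.tensor_parallel_size",
    "parallel_config.expert_parallel_degree": "parallel_config.expert_parallel_size",
    "moe_config.num_max_dispatch_tokens_per_rank": "model_config.num_max_dispatch_tokens_per_rank",
    "moe_config.moe_use_aux_free": "model_config.moe_use_aux_free",
    # All model_config.deepseekv3.<field> reads become model_config.<field>,
    # e.g. kv_lora_rank, qk_nope_head_dim, n_routed_experts, rope_scaling.
}
```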