
Simplify the Config code #2770

Merged
merged 25 commits into from Jul 14, 2025
541 changes: 273 additions & 268 deletions fastdeploy/config.py

Large diffs are not rendered by default.
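Since the config.py diff itself is collapsed, here is a minimal sketch of the flattened layout that the call sites below now assume. Only the attribute names are taken from the hunks in this PR; the class shapes and default values are assumptions, not the real fastdeploy/config.py.

# Hypothetical sketch, not code from this PR: it only mirrors attribute names
# referenced by the updated call sites (num_hidden_layers, tensor_parallel_size,
# expert_parallel_size, the flattened MLA fields). Defaults are placeholders.
from dataclasses import dataclass, field
from typing import Optional

@dataclass
class ModelConfig:
    num_hidden_layers: int = 32            # replaces model_config.num_layers
    num_attention_heads: int = 32
    num_key_value_heads: int = 8
    head_dim: int = 128
    hidden_size: int = 4096
    # formerly nested under model_config.deepseekv3, now flattened
    kv_lora_rank: Optional[int] = None
    qk_rope_head_dim: Optional[int] = None
    qk_nope_head_dim: Optional[int] = None
    rope_scaling: Optional[dict] = None

@dataclass
class ParallelConfig:
    tensor_parallel_size: int = 1          # replaces tensor_parallel_degree
    tensor_parallel_rank: int = 0
    expert_parallel_size: int = 1          # replaces expert_parallel_degree
    expert_parallel_rank: int = 0

@dataclass
class FDConfig:
    model_config: ModelConfig = field(default_factory=ModelConfig)
    parallel_config: ParallelConfig = field(default_factory=ParallelConfig)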

@@ -95,7 +95,7 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
self.kv_num_heads: int = kv_num_heads
self.num_heads: int = num_heads
self.head_dim: int = fd_config.model_config.head_dim
- self.num_layers: int = fd_config.model_config.num_layers
+ self.num_layers: int = fd_config.model_config.num_hidden_layers
self.max_partition_size: int = int(
os.getenv("FLAGS_max_partition_size", 32768))

4 changes: 2 additions & 2 deletions fastdeploy/model_executor/layers/attention/attention.py
@@ -67,10 +67,10 @@ def __init__(
ValueError: If the `v_head_dim` is less than 0.
"""
super().__init__()
- self.num_heads: int = fd_config.model_config.num_attention_heads // fd_config.parallel_config.tensor_parallel_degree
+ self.num_heads: int = fd_config.model_config.num_attention_heads // fd_config.parallel_config.tensor_parallel_size
self.head_dim: int = fd_config.model_config.head_dim
self.kv_num_heads: int = \
- max(1, fd_config.model_config.num_key_value_heads // fd_config.parallel_config.tensor_parallel_degree)
+ max(1, fd_config.model_config.num_key_value_heads // fd_config.parallel_config.tensor_parallel_size)
self.layer_id: int = layer_id
self.v_head_dim: int = v_head_dim if v_head_dim > 0 else self.head_dim
self.rope_type: str = rope_type
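The two changed lines above only rename the divisor; the per-rank head math they implement is plain integer division with a floor of 1 on grouped KV heads. A standalone sketch with assumed head counts:

# Standalone illustration; the example sizes are assumptions, not from the PR.
def heads_per_rank(num_attention_heads: int, num_key_value_heads: int,
                   tensor_parallel_size: int) -> tuple[int, int]:
    num_heads = num_attention_heads // tensor_parallel_size
    # When there are fewer KV heads than TP ranks, each rank keeps one (replicated).
    kv_num_heads = max(1, num_key_value_heads // tensor_parallel_size)
    return num_heads, kv_num_heads

print(heads_per_rank(32, 8, 8))   # (4, 1)
print(heads_per_rank(32, 8, 4))   # (8, 2)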
@@ -96,7 +96,7 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
self.head_dim = fd_config.model_config.head_dim
self.hidden_size = fd_config.model_config.hidden_size
self.block_size = fd_config.parallel_config.block_size
- self.num_layers: int = fd_config.model_config.num_layers
+ self.num_layers: int = fd_config.model_config.num_hidden_layers

self.speculative_method = fd_config.speculative_config.method
self.use_speculate = self.speculative_method is not None
@@ -102,7 +102,7 @@ def __init__(self, llm_config: FDConfig, kv_num_heads: int, num_heads: int,
self.head_dim = head_dim
# note: scale need to change if using MLA
self.attention_metadata.scale = 1.0 / sqrt(head_dim)
- self.num_layers = llm_config.model_config.num_layers
+ self.num_layers = llm_config.model_config.num_hidden_layers
self.record_block_table_metadata = {}
self.only_use_flash_attn = int(
os.getenv("FD_ILUVATAR_ONLY_USE_FLASH_ATTN", 0)) == 1
@@ -113,18 +113,18 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
self.kv_num_heads: int = kv_num_heads
self.num_heads: int = num_heads
self.head_dim: int = fd_config.model_config.head_dim
- self.num_layers: int = fd_config.model_config.num_layers
+ self.num_layers: int = fd_config.model_config.num_hidden_layers

# For Multi Head Latent Attention
- self.kv_lora_rank: int = fd_config.model_config.deepseekv3.kv_lora_rank
- self.qk_rope_head_dim: int = fd_config.model_config.deepseekv3.qk_rope_head_dim
- self.qk_head_dim: int = fd_config.model_config.deepseekv3.qk_nope_head_dim \
- + fd_config.model_config.deepseekv3.qk_rope_head_dim
+ self.kv_lora_rank: int = fd_config.model_config.kv_lora_rank
+ self.qk_rope_head_dim: int = fd_config.model_config.qk_rope_head_dim
+ self.qk_head_dim: int = fd_config.model_config.qk_nope_head_dim \
+ + fd_config.model_config.qk_rope_head_dim
self.attn_softmax_scale: float = self.qk_head_dim**-0.5
- if fd_config.model_config.deepseekv3.rope_scaling:
- mscale_all_dim = fd_config.model_config.deepseekv3.rope_scaling.get(
+ if fd_config.model_config.rope_scaling:
+ mscale_all_dim = fd_config.model_config.rope_scaling.get(
"mscale_all_dim", False) # 1.0
- scaling_factor = fd_config.model_config.deepseekv3.rope_scaling[
+ scaling_factor = fd_config.model_config.rope_scaling[
"factor"] # 40
mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
self.attn_softmax_scale = self.attn_softmax_scale * mscale * mscale
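These hunks only flatten the config path (model_config.deepseekv3.* becomes model_config.*); the scale arithmetic is untouched. A self-contained sketch of that arithmetic, with yarn_get_mscale written out in its commonly used YaRN form, which is an assumption here rather than code copied from this repo:

import math

def yarn_get_mscale(scale: float = 1.0, mscale: float = 1.0) -> float:
    # Common YaRN magnitude-scaling rule; assumed to match the repo's helper.
    if scale <= 1.0:
        return 1.0
    return 0.1 * mscale * math.log(scale) + 1.0

def mla_softmax_scale(qk_nope_head_dim: int, qk_rope_head_dim: int,
                      rope_scaling: dict | None) -> float:
    qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
    scale = qk_head_dim ** -0.5
    if rope_scaling:
        mscale_all_dim = rope_scaling.get("mscale_all_dim", False)  # e.g. 1.0
        scaling_factor = rope_scaling["factor"]                     # e.g. 40
        m = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
        scale = scale * m * m
    return scale

# Example with assumed DeepSeek-style dimensions.
print(mla_softmax_scale(128, 64, {"mscale_all_dim": 1.0, "factor": 40}))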
2 changes: 1 addition & 1 deletion fastdeploy/model_executor/layers/attention/utils.py
@@ -22,7 +22,7 @@ def init_rank_and_device_id(fd_config: FDConfig):

"""
rank = (fd_config.parallel_config.expert_parallel_rank *
- fd_config.parallel_config.tensor_parallel_degree + fd_config.parallel_config.tensor_parallel_rank)
+ fd_config.parallel_config.tensor_parallel_size + fd_config.parallel_config.tensor_parallel_rank)

cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES", None)
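Only the attribute name changes here; the flattened rank remains expert-parallel-major. A tiny worked example with assumed sizes:

# rank = expert_parallel_rank * tensor_parallel_size + tensor_parallel_rank
# With tensor_parallel_size = 4: ep_rank 0 covers ranks 0-3, ep_rank 1 covers 4-7.
def flatten_rank(expert_parallel_rank: int, tensor_parallel_size: int,
                 tensor_parallel_rank: int) -> int:
    return expert_parallel_rank * tensor_parallel_size + tensor_parallel_rank

assert flatten_rank(1, 4, 2) == 6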

@@ -95,7 +95,7 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
self.kv_num_heads: int = kv_num_heads
self.num_heads: int = num_heads
self.head_dim: int = head_dim
- self.num_layers: int = fd_config.model_config.num_layers
+ self.num_layers: int = fd_config.model_config.num_hidden_layers

# pd_disaggregation
self.use_pd_disaggregation: int = int(
@@ -88,7 +88,7 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
self.num_heads = num_heads
self.head_dim = head_dim
self.scaling = 1.0 / (self.head_dim**0.5)
- self.num_layers = fd_config.model_config.num_layers
+ self.num_layers = fd_config.model_config.num_hidden_layers
self.position_ids_base = paddle.arange(self.max_seq_len)

# TODO(zhengjun): Need to adapt the allocation logic and
@@ -88,7 +88,7 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
self.num_heads = num_heads
self.head_dim = head_dim
self.scaling = 1.0 / (self.head_dim**0.5)
- self.num_layers = fd_config.model_config.num_layers
+ self.num_layers = fd_config.model_config.num_hidden_layers
self.position_ids_base = paddle.arange(self.max_seq_len)

# TODO(zhengjun): Need to adapt the allocation logic and
11 changes: 1 addition & 10 deletions fastdeploy/model_executor/layers/embeddings.py
@@ -59,13 +59,11 @@ def __init__(
self.world_size: int = hcg.get_model_parallel_world_size()
self.ring_id: int = hcg.get_model_parallel_group().id
self.use_rope: bool = fd_config.model_config.use_rope
- self.rope_head_dim: int = fd_config.model_config.rope_head_dim
self.use_ep: bool = fd_config.parallel_config.use_ep
self.hidden_dropout_prob: float = fd_config.model_config.hidden_dropout_prob
self.initializer_range: float = fd_config.model_config.initializer_range
self.sequence_parallel: bool = fd_config.parallel_config.sequence_parallel
self.max_position_embeddings: int = fd_config.model_config.max_position_embeddings
- self.freeze_embedding: bool = fd_config.model_config.freeze_embedding
self.tie_word_embeddings: bool = fd_config.model_config.tie_word_embeddings
self.params_dtype: str = params_dtype

@@ -104,15 +102,7 @@ def __init__(
)

self.prefix = prefix

- if self.freeze_embedding:
- self.word_embeddings.weight.learning_rate = 0.0
- if not self.use_rope:
- self.position_embeddings.weight.learning_rate = 0.0

self.dropout = nn.Dropout(self.hidden_dropout_prob)
- self.rope_head_dim_shape_tensor = paddle.ones((self.rope_head_dim),
- dtype="int8")

def load_state_dict(self, state_dict: Dict[str,
paddle.Tensor | np.ndarray]):
@@ -122,6 +112,7 @@ def load_state_dict(self, state_dict: Dict[str,
Args:
state_dict (dict): A dictionary containing the checkpoint weights and biases.
"""
+ a = state_dict[self.prefix + ".weight"]
if self.tie_word_embeddings:
self.word_embeddings.weight.set_value(
get_tensor(state_dict[self.prefix + ".weight"]).astype(
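For context on the tie_word_embeddings branch kept above: weight tying means the output projection reuses the input embedding matrix. A paddle-free illustration of that sharing; all names and sizes here are assumptions:

import numpy as np

vocab_size, hidden_size = 16, 8
shared_embedding = np.random.rand(vocab_size, hidden_size).astype("float32")

def embed(token_ids):                    # input side: row lookup
    return shared_embedding[token_ids]

def lm_head_logits(hidden_states):       # output side: same matrix, transposed
    return hidden_states @ shared_embedding.T

h = embed(np.array([1, 2, 3]))
print(lm_head_logits(h).shape)           # (3, 16)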
10 changes: 5 additions & 5 deletions fastdeploy/model_executor/layers/linear.py
@@ -266,7 +266,7 @@ def __init__(
with_bias=with_bias,
add_bias=add_bias,
skip_quant=skip_quant)
- self.nranks = fd_config.parallel_config.tensor_parallel_degree
+ self.nranks = fd_config.parallel_config.tensor_parallel_size
self.input_size = input_size
self.output_size = divide(
output_size,
@@ -348,7 +348,7 @@ def __init__(
"""
self.activation = activation
self.hidden_size = fd_config.model_config.hidden_size
- self.nranks = fd_config.parallel_config.tensor_parallel_degree
+ self.nranks = fd_config.parallel_config.tensor_parallel_size

super().__init__(fd_config=fd_config,
prefix=prefix,
@@ -410,7 +410,7 @@ def __init__(self, fd_config, prefix, with_bias=False, add_bias=True):
self.kv_num_heads = fd_config.model_config.num_key_value_heads
self.hidden_size = fd_config.model_config.hidden_size
self.head_dim = fd_config.model_config.head_dim
- self.nranks = fd_config.parallel_config.tensor_parallel_degree
+ self.nranks = fd_config.parallel_config.tensor_parallel_size
self.num_heads_per_rank = divide(self.num_heads, self.nranks)
if self.kv_num_heads < self.nranks and self.nranks % self.kv_num_heads == 0:
self.kv_num_heads_per_rank = 1
@@ -545,7 +545,7 @@ def __init__(
skip_quant=skip_quant)
self.fd_config = fd_config
self.skip_quant = False
- self.nranks = fd_config.parallel_config.tensor_parallel_degree
+ self.nranks = fd_config.parallel_config.tensor_parallel_size
self.hidden_size = fd_config.model_config.hidden_size
self.head_dim = fd_config.model_config.head_dim
self.num_heads = fd_config.model_config.num_attention_heads // self.nranks
@@ -638,7 +638,7 @@ def __init__(
with_bias (bool): Whether to include bias or not. Defaults to False.
skip_quant (bool): Whether to skip quantization. Defaults to False.
"""
- self.nranks = fd_config.parallel_config.tensor_parallel_degree
+ self.nranks = fd_config.parallel_config.tensor_parallel_size
self.kv_lora_rank = kv_lora_rank
self.num_attention_heads = num_attention_heads
self.qk_nope_head_dim = qk_nope_head_dim
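Every hunk in this file is the same rename; the partitioning it feeds is unchanged. A standalone sketch of the QKV head split, where divide() is assumed to be an exact division that rejects remainders:

def divide(numerator: int, denominator: int) -> int:
    # Assumed semantics: the split must be exact across ranks.
    assert numerator % denominator == 0, f"{numerator} is not divisible by {denominator}"
    return numerator // denominator

def qkv_heads_per_rank(num_heads: int, kv_num_heads: int, nranks: int) -> tuple[int, int]:
    num_heads_per_rank = divide(num_heads, nranks)
    if kv_num_heads < nranks and nranks % kv_num_heads == 0:
        # Fewer KV heads than TP ranks: each rank holds one replicated KV head.
        kv_num_heads_per_rank = 1
    else:
        kv_num_heads_per_rank = divide(kv_num_heads, nranks)
    return num_heads_per_rank, kv_num_heads_per_rank

print(qkv_heads_per_rank(64, 8, 16))  # (4, 1)
print(qkv_heads_per_rank(64, 8, 4))   # (16, 2)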
@@ -49,7 +49,7 @@ def init_ep(self, layer: nn.Layer) -> None:
from .ep import EPDecoderRunner
self.ep_decoder_runner = EPDecoderRunner(
layer.top_k, layer.hidden_size, layer.num_experts,
- layer.moe_config.num_max_dispatch_tokens_per_rank,
+ layer.model_config.num_max_dispatch_tokens_per_rank,
layer.ep_size, layer.ep_rank)
else:
from .ep import EPPrefillRunner
@@ -14,7 +14,6 @@
# limitations under the License.
"""

- import numpy as np
import paddle
from paddle import nn
from paddleformers.utils.log import logger
@@ -23,8 +22,8 @@
import fastdeploy.model_executor.ops.gpu.deep_gemm as deep_gemm
from fastdeploy.distributed.communication_op import \
tensor_model_parallel_all_reduce
- from fastdeploy.model_executor.ops.gpu import count_tokens_per_expert_func
from fastdeploy.model_executor.layers.utils import get_tensor
+ from fastdeploy.model_executor.ops.gpu import count_tokens_per_expert_func

from ..utils import create_and_set_parameter
from .fused_moe_backend_base import MoEMethodBase
@@ -242,7 +241,7 @@ def apply_ep_decode(
[
layer.num_local_experts,
layer.ep_size *
- layer.moe_config.num_max_dispatch_tokens_per_rank,
+ layer.model_config.num_max_dispatch_tokens_per_rank,
layer.moe_intermediate_size * 2,
],
dtype=paddle.bfloat16,
@@ -252,7 +251,7 @@
[
layer.num_local_experts,
layer.ep_size *
- layer.moe_config.num_max_dispatch_tokens_per_rank,
+ layer.model_config.num_max_dispatch_tokens_per_rank,
layer.hidden_size,
],
dtype=paddle.bfloat16,
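The two hunks above only repoint num_max_dispatch_tokens_per_rank at model_config; the decode-path buffer shapes are unchanged. Shape arithmetic with assumed sizes (plain Python, no paddle):

# Assumed example sizes, for the shape arithmetic only.
num_local_experts = 8
ep_size = 8
num_max_dispatch_tokens_per_rank = 128
moe_intermediate_size = 1536
hidden_size = 4096

# Up-projection output: the trailing x2 typically packs the gate and up halves.
ffn_out_shape = (num_local_experts,
                 ep_size * num_max_dispatch_tokens_per_rank,
                 moe_intermediate_size * 2)
# Down-projection output, back at the model hidden size.
down_out_shape = (num_local_experts,
                  ep_size * num_max_dispatch_tokens_per_rank,
                  hidden_size)
print(ffn_out_shape, down_out_shape)  # (8, 1024, 3072) (8, 1024, 4096)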
7 changes: 3 additions & 4 deletions fastdeploy/model_executor/layers/moe/moe.py
@@ -72,16 +72,15 @@ def __init__(
self.layer_idx = layer_idx
self.reduce_results = reduce_results

- self.tp_size = fd_config.parallel_config.tensor_parallel_degree
- self.ep_size = fd_config.parallel_config.expert_parallel_degree
+ self.tp_size = fd_config.parallel_config.tensor_parallel_size
+ self.ep_size = fd_config.parallel_config.expert_parallel_size
self.ep_rank = fd_config.parallel_config.expert_parallel_rank

assert (self.tp_size >= 1 and self.ep_size == 1) or \
(self.tp_size == 1 and self.ep_size > 1), \
'MoE only support parallelism on TP or EP dimension.'

self.hidden_size = fd_config.model_config.hidden_size
- self.moe_config = fd_config.moe_config
self.num_experts = num_experts
self.num_local_experts = self.num_experts // self.ep_size

@@ -141,7 +140,7 @@ def init_moe_weights(self):
shape=gate_weight_shape,
dtype="float32",
)
- if self.moe_config.moe_use_aux_free:
+ if self.model_config.moe_use_aux_free:
self.gate_correction_bias = self.create_parameter(
shape=gate_correction_bias_shape,
dtype="float32",
6 changes: 3 additions & 3 deletions fastdeploy/model_executor/load_weight_utils.py
@@ -43,7 +43,7 @@ def load_ep_checkpoint(model_path: str,
filtered_map = {k: v for k, v in weight_list.items() if "experts" not in k}
num_local_ffn_keys = []

- for i in range(config.moe_layer_start_index, config.num_layers):
+ for i in range(config.moe_layer_start_index, config.num_hidden_layers):
for j in range(
config.num_experts_start_offset,
config.num_experts_start_offset + config.num_experts_per_rank,
@@ -261,7 +261,7 @@ def load_composite_checkpoint(
and os.path.isdir(os.path.join(model_path, f))
]
if len(rank_dirs) > 1:
- if fd_config.parallel_config.tensor_parallel_degree != len(
+ if fd_config.parallel_config.tensor_parallel_size != len(
rank_dirs):
raise ValueError(
f"Your model only supports loading with tp{len(rank_dirs)}"
@@ -283,7 +283,7 @@
else:
state_dict = load_tp_checkpoint(model_path,
cls,
- fd_config.model_config,
+ fd_config.model_config.pretrained_config,
return_numpy=return_numpy)
if not state_dict:
raise ValueError("weight not found in state_dict !")