From 65de2bafe0d3825cec87b76660c59df68afd7bae Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Wed, 9 Jul 2025 07:54:47 +0000 Subject: [PATCH 01/19] simplify the code --- fastdeploy/config.py | 502 +++++++++--------- fastdeploy/engine/engine.py | 8 +- .../layers/attention/append_attn_backend.py | 4 +- .../layers/attention/attention.py | 4 +- .../layers/attention/flash_attn_backend.py | 4 +- .../layers/attention/mla_attention_backend.py | 16 +- .../layers/attention/xpu_attn_backend.py | 2 +- .../model_executor/layers/embeddings.py | 10 - fastdeploy/model_executor/layers/linear.py | 10 +- .../layers/moe/fused_moe_backend_base.py | 2 +- .../layers/moe/fused_moe_deepgemm_backend.py | 7 +- fastdeploy/model_executor/layers/moe/moe.py | 7 +- .../model_executor/load_weight_utils.py | 2 +- .../model_executor/models/deepseek_v3.py | 42 +- .../model_executor/models/ernie4_5_moe.py | 26 +- .../model_executor/models/ernie4_5_mtp.py | 8 +- .../models/ernie4_5_vl/configuration.py | 4 +- .../models/ernie4_5_vl/ernie4_5_vl_moe.py | 50 +- .../model_executor/models/model_base.py | 2 +- fastdeploy/model_executor/models/qwen2.py | 10 +- fastdeploy/model_executor/models/qwen3.py | 6 +- fastdeploy/model_executor/models/qwen3moe.py | 22 +- fastdeploy/model_executor/models/tp_utils.py | 2 +- fastdeploy/rl/rollout_model.py | 10 +- fastdeploy/spec_decode/mtp.py | 14 +- fastdeploy/worker/gpu_model_runner.py | 28 +- fastdeploy/worker/vl_gpu_model_runner.py | 156 ++---- fastdeploy/worker/worker_process.py | 201 ++----- fastdeploy/worker/xpu_model_runner.py | 14 +- 29 files changed, 466 insertions(+), 707 deletions(-) diff --git a/fastdeploy/config.py b/fastdeploy/config.py index 446e59298d..51645ce49a 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -22,13 +22,13 @@ from paddleformers.transformers.configuration_utils import PretrainedConfig +from fastdeploy import envs from fastdeploy.model_executor.layers.quantization.quant_base import \ QuantConfigBase from fastdeploy.utils import get_logger logger = get_logger("config", "config.log") - class MoEPhase(Enum): """ The generation phase of the moe. @@ -37,268 +37,199 @@ class MoEPhase(Enum): PREFILL = 1 DECODER = 2 - -class ModelConfig(PretrainedConfig): +class ModelConfig: """ The configuration class to store the configuration of a `LLM`. 
""" - max_stop_seqs_num = 5 - stop_seqs_max_len = 8 - - architectures: list[str] = [] - - # NOTE(gongshaotain): form _load_model_init_val() - top_p = 0.0 - temperature = 1.0 - rope_theta = 10000.0 - penalty_score = 1.0 - frequency_score = 0.0 - presence_score = 0.0 - min_length = 1 - def __init__( self, - vocab_size: int = 100224, - hidden_size: int = 4096, - num_layers: int = 48, - num_attention_heads: int = 32, - num_key_value_heads: Optional[int] = None, - hidden_act: str = "swiglu", - hidden_dropout_prob: float = 0.0, - max_position_embeddings: int = 512, - max_seq_len: int = 512, - initializer_range: float = 0.02, - use_rope=True, - rope_theta: int = 10000, - rope_3d: bool = False, - ori_vocab_size: int | None = None, - moe_layer_start_index: int | None = None, - moe_layer_end_index: int | None = None, - num_hidden_layers: int | None = None, - prefix_name="", - freeze_embedding=False, - rope_head_dim=None, - ffn_hidden_size: Optional[int] = None, - dtype="bfloat16", - start_layer_index: int = 0, - head_dim: Optional[int] = None, - tie_word_embeddings: bool = False, - is_quantized: bool = False, - **kwargs, + args, ): - super().__init__(**kwargs) - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_layers = num_layers - if num_hidden_layers is not None: - self.num_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - if head_dim is None: + self.max_stop_seqs_num = 5 + self.stop_seqs_max_len = 8 + + # NOTE(gongshaotain): form _load_model_init_val() + self.top_p = 0.0 + self.temperature = 1.0 + self.rope_theta = 10000.0 + self.penalty_score = 1.0 + self.frequency_score = 0.0 + self.presence_score = 0.0 + self.min_length = 1 + self.model_name_or_path = "" + + self.im_patch_id = ( + 100295 # multimodality, TODO(liuyuanle): read from config.json + ) + self.is_quantized = False + self.max_model_len = 0 + self.dtype = "" + + PRETRAINED_INIT_CONFIGURATION = { + "rope_theta": 10000.0, + "num_key_value_heads":-1, + "start_layer_index": 0, + "moe_num_shared_experts":0, + "moe_layer_start_index": 0, + "num_max_dispatch_tokens_per_rank":256, + "moe_use_aux_free":False, + "vocab_size": -1, + "use_rope": True, + "hidden_dropout_prob":0.0, + "initializer_range":0.02, + "max_position_embeddings":512, + "quantization_config":None, + } + + for key, value in args.items(): + if hasattr(self, key): + setattr(self, key, value) + pretrained_config, _ = PretrainedConfig.get_config_dict(self.model_name_or_path) + + # set attribute from pretrained_config + for key, value in pretrained_config.items(): + setattr(self, key, value) + + # we need set default value when not exist + for key, value in PRETRAINED_INIT_CONFIGURATION.items(): + if not hasattr(self, key): + setattr(self, key, value) + + if not hasattr(self, "head_dim"): self.head_dim = self.hidden_size // self.num_attention_heads - else: - self.head_dim = head_dim - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.use_rope = use_rope - self.rope_theta = rope_theta - self.ori_vocab_size = ori_vocab_size or vocab_size - self.max_seq_len = max_seq_len - self.prefix_name = prefix_name - self.freeze_embedding = freeze_embedding - self.rope_head_dim = rope_head_dim - moe_num_experts = kwargs.get("moe_num_experts", 0) - if moe_layer_start_index is not None: - self.moe_layer_start_index = moe_layer_start_index - elif moe_num_experts == 
0: - self.moe_layer_start_index = self.num_layers - self.moe_num_experts = 0 - if moe_layer_end_index is not None: - self.moe_layer_end_index = moe_layer_end_index - self.ffn_hidden_size = ffn_hidden_size - self.rope_3d = rope_3d - self.start_layer_index = start_layer_index - self.dtype = dtype - self.tie_word_embeddings = tie_word_embeddings - self.is_quantized = is_quantized + if "Ernie4_5_ForCausalLM" in self.architectures: + self.vocab_size = args["vocab_size"] -@dataclass -class MoEConfig: - """ - Configuration for MoE. - """ - num_experts: int = -1 - top_k: int = 8 - moe_intermediate_size: int = -1 - num_experts_per_rank: int = -1 - num_experts_start_offset: int = -1 - - moe_num_shared_experts = (0, ) - moe_layer_start_index = 0 - moe_layer_end_index = None - moe_use_aux_free: bool = False - num_max_dispatch_tokens_per_rank = 256 - im_patch_id = ( - 100295 # multimodality, TODO(liuyuanle): read from config.json - ) - - -@dataclass class ParallelConfig: """Configuration for the distributed execution.""" - block_size = 16 # The block size for processing. - sequence_parallel = False # Whether to enable sequence parallelism. - use_ep = False # Whether to enable Expert Parallelism - moe_phase = MoEPhase.PREFILL # Generation phase - msg_queue_id = 1 # mesage queue id - tensor_parallel_rank = None # TP rank ID - tensor_parallel_degree = None # TP degree - expert_parallel_rank = None # EP rank ID - expert_parallel_degree = None # EP degree - # The embedding weight distributed on your gpu cards is divided by row or column. - # Defaults to False means divide by row. When vocab_size can not be divided by world_size - # but hidden_size can, we can consider split embedding weight by column. - """ - From old wersion worker args - TODO(gongshaotian): Reclassify - """ - model_name_or_path: str = "./output" - max_num_seqs: int = 34 - # Set default block num for profile run - max_block_num: int = 2000 - # block size - block_size: int = 64 - # Engine worker queue port - engine_worker_queue_port: int = 9923 - # Max model len - max_model_len: int = 3072 # max_seq_len - # cuda visible devices - device_ids: str = "0" - # Input dtype - dtype: str = "bfloat16" - # Encoder's decoder num - enc_dec_block_num: int = 1 - # KV cache ratio for input - kv_cache_ratio: float = 0.7 - # First token id - first_token_id: int = 1 - # Gpu memory utilization - gpu_memory_utilization: float = 0.9 - # Process ID of engine - engine_pid: Optional[int] = None - # Do profile or not - do_profile: bool = False - # - pad_token_id: int = -1 - # - eos_tokens_lens: int = 2 - # Enable chunked prefill - enable_chunked_prefill: str = "store_true" - - max_num_batched_tokens: int = 2048 - # enable prefix cache - enable_prefix_caching = None - # splitwise role - splitwise_role: str = "mixed" - # guided decoding backend - guided_decoding_backend: str = None - # disable any whitespace for guided decoding - disable_any_whitespace: bool = True - + def __init__( + self, + args, + ): + self.block_size = 16 # The block size for processing. + self.sequence_parallel = False # Whether to enable sequence parallelism. + self.use_ep = False # Whether to enable Expert Parallelism + self.moe_phase = MoEPhase.PREFILL # Generation phase + self.msg_queue_id = 1 # mesage queue id + self.tensor_parallel_rank = None # TP rank ID + self.tensor_parallel_size = None # TP degree + self.expert_parallel_rank = None # EP rank ID + self.expert_parallel_size= None # EP degree + # The embedding weight distributed on your gpu cards is divided by row or column. 
+ # Defaults to False means divide by row. When vocab_size can not be divided by world_size + # but hidden_size can, we can consider split embedding weight by column. + """ + From old wersion worker args + TODO(gongshaotian): Reclassify + """ + self.model_name_or_path: str = "./output" + self.max_num_seqs: int = 34 + # Set default block num for profile run + self.max_block_num: int = 2000 + # block size + self.block_size: int = 64 + # Engine worker queue port + self.engine_worker_queue_port: int = 9923 + # Max model len + self.max_model_len: int = 3072 # max_seq_len + # cuda visible devices + self.device_ids: str = "0" + # Input dtype + self.dtype: str = "bfloat16" + # Encoder's decoder num + self.enc_dec_block_num: int = 1 + # KV cache ratio for input + self.kv_cache_ratio: float = 0.7 + # First token id + self.first_token_id: int = 1 + # Gpu memory utilization + self.gpu_memory_utilization: float = 0.9 + # Process ID of engine + self.engine_pid: Optional[int] = None + # Do profile or not + self.do_profile: bool = False + # + self.pad_token_id: int = -1 + # + self.eos_tokens_lens: int = 2 + # Enable chunked prefill + self.enable_chunked_prefill: str = "store_true" + + self.max_num_batched_tokens: int = 2048 + # enable prefix cache + self.enable_prefix_caching = None + # splitwise role + self.splitwise_role: str = "mixed" + # guided decoding backend + self.guided_decoding_backend: str = None + # disable any whitespace for guided decoding + self.disable_any_whitespace: bool = True + self.pod_ip: str = None + for key, value in args.items(): + if hasattr(self, key): + setattr(self, key, value) + self.use_ep = args["expert_parallel_size"] > 1 + if self.splitwise_role == "mixed": + self.moe_phase = MoEPhase.PREFILL + elif self.splitwise_role == "prefill": + self.moe_phase = MoEPhase.PREFILL + elif self.splitwise_role == "decode": + self.moe_phase = MoEPhase.DECODER + else: + raise NotImplementedError @dataclass class SpeculativeConfig: """ Configuration for speculative decoding. """ - # speculative method, choose in [None, "ngram_match", "mtp"] - method: Optional[str] = None - # the max length of speculative tokens - num_speculative_tokens: int = 1 - # the max length of candidate tokens for speculative method - max_candidate_len: int = 5 - # the max length of verify window for speculative method - verify_window: int = 2 - # ngram match - max_ngram_size: int = 5 - # model for mtp/eagle/draft_model - model_name_or_path: Optional[str] = None - # quantization of model - quantization: Optional[str] = None - # allocate more blocks to prevent mtp from finishing the block earlier than the main model - # Fixed now - num_gpu_block_expand_ratio: Optional[float] = 1 - # To distinguish the main model and draft model(mtp/eagle/draftmodel) - # ["main", "mtp"] - model_type: Optional[str] = "main" - # TODO(liuzichang): To reduce memory usage, MTP shares the main model's lm_head and embedding layers. - # A trick method is currently used to enable this sharing. - # This will be replaced with a more standardized solution in the future. 
- sharing_model = None - + def __init__( + self, + args, + ): + # speculative method, choose in [None, "ngram_match", "mtp"] + self.method: Optional[str] = None + # the max length of speculative tokens + self.num_speculative_tokens: int = 1 + # the max length of candidate tokens for speculative method + self.max_candidate_len: int = 5 + # the max length of verify window for speculative method + self.verify_window: int = 2 + # ngram match + self.max_ngram_size: int = 5 + # model for mtp/eagle/draft_model + self.model_name_or_path: Optional[str] = None + # quantization of model + self.quantization: Optional[str] = None + # allocate more blocks to prevent mtp from finishing the block earlier than the main model + # Fixed now + self.num_gpu_block_expand_ratio: Optional[float] = 1 + # To distinguish the main model and draft model(mtp/eagle/draftmodel) + # ["main", "mtp"] + self.model_type: Optional[str] = "main" + # TODO(liuzichang): To reduce memory usage, MTP shares the main model's lm_head and embedding layers. + # A trick method is currently used to enable this sharing. + # This will be replaced with a more standardized solution in the future. + self.sharing_model = None + for key, value in args.items(): + if hasattr(self, key): + setattr(self, key, value) -@dataclass class DeviceConfig: """ Configuration for device settings. """ - device_type = "cuda" - + def __init__( + self, + args, + ): + self.device_type = "cuda" + for key, value in args.items(): + if hasattr(self, key): + setattr(self, key, value) class GraphOptimizationConfig: - """The Top-level graph optimization contral corresponds to different backends. - - 0: dyncmic graph - - 1: static graph - - 2: static graph + cinn compilation backend - """ - graph_opt_level: int = 0 - - # CUDA Graph Config - """ Whether to use cudagraph. - - False: cudagraph is not used. - - True: cudagraph is used. - It requires that all input buffers have fixed addresses, and all - splitting ops write their outputs to input buffers. - - With dyncmic graph backend: ... - - With static grpah backend: WIP - """ - use_cudagraph: bool = False - """Sizes to capture cudagraph. - - None (default): capture sizes are inferred from llm config. - - list[int]: capture sizes are specified as given.""" - cudagraph_capture_sizes: Optional[list[int]] = None - """ Number of warmup runs for cudagraph. """ - cudagraph_num_of_warmups: int = 2 - """Whether to copy input tensors for cudagraph. - If the caller can guarantee that the same input buffers - are always used, it can set this to False. Otherwise, it should - set this to True.""" - cudagraph_copy_inputs: bool = False - """ In static graph, this is an operation list that does not need to be captured by the CUDA graph. - CudaGraphBackend will split these operations from the static graph. - Example usage: - cudagraph_splitting_ops = ["paddle.unified_attention"] - - Note: If want to use subgraph capture functionality in a dynamic graph, - can manually split the model into multiple layers and apply the @support_cuda_graph decorator - only to the layer where CUDA graph functionality is required. - """ - cudagraph_splitting_ops = Optional[list[str]] - """"whether to use a full cuda graph for the entire forward pass rather than - splitting certain operations such as attention into subgraphs. 
- Thus this flag cannot be used together with splitting_ops.""" - full_cuda_graph: bool = False - - max_capture_size: int = field(default=None, init=False) # type: ignore - batch_size_to_captured_size: dict[int, - int] = field(default=None, - init=False) # type: ignore - - # CINN Config ... - def init_with_cudagrpah_size(self, cudagraph_capture_sizes: list[int]) -> None: """To complete the initialization of config, @@ -332,18 +263,67 @@ def init_with_cudagrpah_size(self, def __init__(self, enable_static_graph_inference: bool = False, - use_cudagraph: bool = False, - max_capture_batch_size: int = 64): - """ """ + max_capture_batch_size: int = 64, + args = None): + """The Top-level graph optimization contral corresponds to different backends. + - 0: dyncmic graph + - 1: static graph + - 2: static graph + cinn compilation backend + """ + self.graph_opt_level: int = 0 + + # CUDA Graph Config + """ Whether to use cudagraph. + - False: cudagraph is not used. + - True: cudagraph is used. + It requires that all input buffers have fixed addresses, and all + splitting ops write their outputs to input buffers. + - With dyncmic graph backend: ... + - With static grpah backend: WIP + """ + self.use_cudagraph: bool = False + """Sizes to capture cudagraph. + - None (default): capture sizes are inferred from llm config. + - list[int]: capture sizes are specified as given.""" + self.cudagraph_capture_sizes: Optional[list[int]] = None + """ Number of warmup runs for cudagraph. """ + self.cudagraph_num_of_warmups: int = 2 + """Whether to copy input tensors for cudagraph. + If the caller can guarantee that the same input buffers + are always used, it can set this to False. Otherwise, it should + set this to True.""" + self.cudagraph_copy_inputs: bool = False + """ In static graph, this is an operation list that does not need to be captured by the CUDA graph. + CudaGraphBackend will split these operations from the static graph. + Example usage: + cudagraph_splitting_ops = ["paddle.unified_attention"] + + Note: If want to use subgraph capture functionality in a dynamic graph, + can manually split the model into multiple layers and apply the @support_cuda_graph decorator + only to the layer where CUDA graph functionality is required. + """ + self.cudagraph_splitting_ops = Optional[list[str]] + """"whether to use a full cuda graph for the entire forward pass rather than + splitting certain operations such as attention into subgraphs. + Thus this flag cannot be used together with splitting_ops.""" + self.full_cuda_graph: bool = False + + self.max_capture_size: int = field(default=None, init=False) # type: ignore + self.batch_size_to_captured_size: dict[int, + int] = field(default=None, + init=False) # type: ignore + + # CINN Config ... 
+ + for key, value in args.items(): + if hasattr(self, key): + setattr(self, key, value) capture_size = [i for i in range(1, max_capture_batch_size + 1)] self.init_with_cudagrpah_size(cudagraph_capture_sizes=capture_size) - self.use_cudagraph = use_cudagraph #TODO(wangmingkai02): change graph_opt_level=2 when using static mode with cinn if enable_static_graph_inference: self.graph_opt_level = 1 - -@dataclass class LoadConfig: """ Configuration for dynamic weight loading strategies @@ -357,17 +337,16 @@ class LoadConfig: - 'meta': provide RL traing worker, no_weights_load - None: No dynamic loading """ - use_fastsafetensor: bool = False - dynamic_load_weight: bool = False - load_strategy: Optional[Literal['ipc', 'ipc_no_reshard', 'ipc_snapshot', 'meta']] = None - - def __post_init__(self): - if self.load_strategy is not None and not self.dynamic_load_weight: - raise ValueError("Load strategy requires dynamic_load_weight=True") - - if self.dynamic_load_weight and self.load_strategy is None: - raise ValueError("Must specify load_strategy when dynamic_load_weight is True") - + def __init__( + self, + args, + ): + self.use_fastsafetensor = int(envs.FD_USE_FASTSAFETENSOR) == 1 + self.dynamic_load_weight: bool = False + self.load_strategy: Optional[Literal['ipc', 'ipc_no_reshard', 'ipc_snapshot', 'meta']] = None + for key, value in args.items(): + if hasattr(self, key): + setattr(self, key, value) @dataclass class LoRAConfig: @@ -386,8 +365,14 @@ class DecodingConfig: """ Configuration for decoding """ - pad_token_id = None - + def __init__( + self, + args, + ): + self.pad_token_id = None + for key, value in args.items(): + if hasattr(self, key): + setattr(self, key, value) @dataclass class FDConfig: @@ -405,7 +390,6 @@ class FDConfig: load_config: LoadConfig = field(default=None, init=True) quant_config: Optional[QuantConfigBase] = None graph_opt_config: Optional[GraphOptimizationConfig] = None - moe_config: MoEConfig = field(default=None, init=True) # type: ignore decoding_config: DecodingConfig = field(default=None, init=True) # type: ignore kv_cache_config: KVCacheConfig = field(default=None, diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py index 5fca12f0b9..e0e95e4bf9 100644 --- a/fastdeploy/engine/engine.py +++ b/fastdeploy/engine/engine.py @@ -997,9 +997,9 @@ def _start_worker_service(self): worker_path = "../worker/vl_worker_process.py" py_script = os.path.join(current_dir_path, worker_path) - ori_vocab_size = ( - len(self.data_processor.tokenizer.sp_model) - if hasattr(self.data_processor.tokenizer, 'sp_model') + vocab_size = ( + len(self.data_processor.tokenizer.sp_model) + if hasattr(self.data_processor.tokenizer, 'sp_model') else len(self.data_processor.tokenizer.vocab) ) @@ -1025,7 +1025,7 @@ def _start_worker_service(self): f" --kv_cache_ratio {self.cfg.cache_config.kv_cache_ratio}" f" --expert_parallel_size {self.cfg.parallel_config.expert_parallel_size}" f" --quantization {self.cfg.model_config.quantization}" - f" --ori_vocab_size {ori_vocab_size}" + f" --vocab_size {vocab_size}" f" --speculative_method {self.cfg.speculative_config.method}" f" --speculative_max_draft_token_num {self.cfg.speculative_config.num_speculative_tokens}" f" --speculative_model_name_or_path {self.cfg.speculative_config.model_name_or_path}" diff --git a/fastdeploy/model_executor/layers/attention/append_attn_backend.py b/fastdeploy/model_executor/layers/attention/append_attn_backend.py index 5bc7f420aa..91ace1ef99 100644 --- 
a/fastdeploy/model_executor/layers/attention/append_attn_backend.py +++ b/fastdeploy/model_executor/layers/attention/append_attn_backend.py @@ -96,7 +96,7 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int, self.kv_num_heads: int = kv_num_heads self.num_heads: int = num_heads self.head_dim: int = fd_config.model_config.head_dim - self.num_layers: int = fd_config.model_config.num_layers + self.num_layers: int = fd_config.model_config.num_hidden_layers self.max_partition_size: int = int( os.getenv("FLAGS_max_partition_size", 32768)) @@ -108,7 +108,7 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int, if fd_config.parallel_config.expert_parallel_rank is None: fd_config.parallel_config.expert_parallel_rank = 0 - device_id = self.rank + fd_config.parallel_config.tensor_parallel_degree * \ + device_id = self.rank + fd_config.parallel_config.tensor_parallel_size * \ fd_config.parallel_config.expert_parallel_rank if self.device_id is None: self.device_id = device_id diff --git a/fastdeploy/model_executor/layers/attention/attention.py b/fastdeploy/model_executor/layers/attention/attention.py index 3f676f0317..736ebea62e 100644 --- a/fastdeploy/model_executor/layers/attention/attention.py +++ b/fastdeploy/model_executor/layers/attention/attention.py @@ -64,10 +64,10 @@ def __init__( ValueError: If the `v_head_dim` is less than 0. """ super().__init__() - self.num_heads: int = fd_config.model_config.num_attention_heads // fd_config.parallel_config.tensor_parallel_degree + self.num_heads: int = fd_config.model_config.num_attention_heads // fd_config.parallel_config.tensor_parallel_size self.head_dim: int = fd_config.model_config.head_dim self.kv_num_heads: int = \ - fd_config.model_config.num_key_value_heads // fd_config.parallel_config.tensor_parallel_degree + fd_config.model_config.num_key_value_heads // fd_config.parallel_config.tensor_parallel_size self.layer_id: int = layer_id self.v_head_dim: int = v_head_dim if v_head_dim > 0 else self.head_dim self.rope_type: str = rope_type diff --git a/fastdeploy/model_executor/layers/attention/flash_attn_backend.py b/fastdeploy/model_executor/layers/attention/flash_attn_backend.py index 74a234bd19..c21edc7b21 100644 --- a/fastdeploy/model_executor/layers/attention/flash_attn_backend.py +++ b/fastdeploy/model_executor/layers/attention/flash_attn_backend.py @@ -90,7 +90,7 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int, self.head_dim = fd_config.model_config.head_dim self.hidden_size = fd_config.model_config.hidden_size self.block_size = fd_config.parallel_config.block_size - self.num_layers: int = fd_config.model_config.num_layers + self.num_layers: int = fd_config.model_config.num_hidden_layers self.speculative_method = fd_config.speculative_config.method self.use_speculate = self.speculative_method is not None @@ -106,7 +106,7 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int, if fd_config.parallel_config.expert_parallel_rank is None: fd_config.parallel_config.expert_parallel_rank = 0 - device_id = self.rank + fd_config.parallel_config.tensor_parallel_degree * \ + device_id = self.rank + fd_config.parallel_config.tensor_parallel_size * \ fd_config.parallel_config.expert_parallel_rank if self.device_id is None: self.device_id = device_id diff --git a/fastdeploy/model_executor/layers/attention/mla_attention_backend.py b/fastdeploy/model_executor/layers/attention/mla_attention_backend.py index 1d9c9773be..a704d05892 100644 --- 
a/fastdeploy/model_executor/layers/attention/mla_attention_backend.py +++ b/fastdeploy/model_executor/layers/attention/mla_attention_backend.py @@ -114,18 +114,18 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int, self.kv_num_heads: int = kv_num_heads self.num_heads: int = num_heads self.head_dim: int = fd_config.model_config.head_dim - self.num_layers: int = fd_config.model_config.num_layers + self.num_layers: int = fd_config.model_config.num_hidden_layers # For Multi Head Latent Attention - self.kv_lora_rank: int = fd_config.model_config.deepseekv3.kv_lora_rank - self.qk_rope_head_dim: int = fd_config.model_config.deepseekv3.qk_rope_head_dim - self.qk_head_dim: int = fd_config.model_config.deepseekv3.qk_nope_head_dim \ - + fd_config.model_config.deepseekv3.qk_rope_head_dim + self.kv_lora_rank: int = fd_config.model_config.kv_lora_rank + self.qk_rope_head_dim: int = fd_config.model_config.qk_rope_head_dim + self.qk_head_dim: int = fd_config.model_config.qk_nope_head_dim \ + + fd_config.model_config.qk_rope_head_dim self.attn_softmax_scale: float = self.qk_head_dim**-0.5 - if fd_config.model_config.deepseekv3.rope_scaling: - mscale_all_dim = fd_config.model_config.deepseekv3.rope_scaling.get( + if fd_config.model_config.rope_scaling: + mscale_all_dim = fd_config.model_config.rope_scaling.get( "mscale_all_dim", False) # 1.0 - scaling_factor = fd_config.model_config.deepseekv3.rope_scaling[ + scaling_factor = fd_config.model_config.rope_scaling[ "factor"] # 40 mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim)) self.attn_softmax_scale = self.attn_softmax_scale * mscale * mscale diff --git a/fastdeploy/model_executor/layers/attention/xpu_attn_backend.py b/fastdeploy/model_executor/layers/attention/xpu_attn_backend.py index 9ecc01fb89..a40b968106 100644 --- a/fastdeploy/model_executor/layers/attention/xpu_attn_backend.py +++ b/fastdeploy/model_executor/layers/attention/xpu_attn_backend.py @@ -96,7 +96,7 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int, self.kv_num_heads: int = kv_num_heads self.num_heads: int = num_heads self.head_dim: int = head_dim - self.num_layers: int = fd_config.model_config.num_layers + self.num_layers: int = fd_config.model_config.num_hidden_layers # pd_disaggregation self.use_pd_disaggregation: int = int( diff --git a/fastdeploy/model_executor/layers/embeddings.py b/fastdeploy/model_executor/layers/embeddings.py index bc67cb1333..44b270cf61 100644 --- a/fastdeploy/model_executor/layers/embeddings.py +++ b/fastdeploy/model_executor/layers/embeddings.py @@ -59,13 +59,11 @@ def __init__( self.world_size: int = hcg.get_model_parallel_world_size() self.ring_id: int = hcg.get_model_parallel_group().id self.use_rope: bool = fd_config.model_config.use_rope - self.rope_head_dim: int = fd_config.model_config.rope_head_dim self.use_ep: bool = fd_config.parallel_config.use_ep self.hidden_dropout_prob: float = fd_config.model_config.hidden_dropout_prob self.initializer_range: float = fd_config.model_config.initializer_range self.sequence_parallel: bool = fd_config.parallel_config.sequence_parallel self.max_position_embeddings: int = fd_config.model_config.max_position_embeddings - self.freeze_embedding: bool = fd_config.model_config.freeze_embedding self.tie_word_embeddings: bool = fd_config.model_config.tie_word_embeddings self.params_dtype: str = params_dtype @@ -104,15 +102,7 @@ def __init__( ) self.prefix = prefix - - if self.freeze_embedding: - self.word_embeddings.weight.learning_rate = 0.0 - if not 
self.use_rope: - self.position_embeddings.weight.learning_rate = 0.0 - self.dropout = nn.Dropout(self.hidden_dropout_prob) - self.rope_head_dim_shape_tensor = paddle.ones((self.rope_head_dim), - dtype="int8") def load_state_dict(self, state_dict: Dict[str, paddle.Tensor | np.ndarray]): diff --git a/fastdeploy/model_executor/layers/linear.py b/fastdeploy/model_executor/layers/linear.py index b8dc49e1b0..5390b2ed82 100644 --- a/fastdeploy/model_executor/layers/linear.py +++ b/fastdeploy/model_executor/layers/linear.py @@ -265,7 +265,7 @@ def __init__( with_bias=with_bias, add_bias=add_bias, skip_quant=skip_quant) - self.nranks = fd_config.parallel_config.tensor_parallel_degree + self.nranks = fd_config.parallel_config.tensor_parallel_size self.input_size = input_size self.output_size = divide( output_size, @@ -347,7 +347,7 @@ def __init__( """ self.activation = activation self.hidden_size = fd_config.model_config.hidden_size - self.nranks = fd_config.parallel_config.tensor_parallel_degree + self.nranks = fd_config.parallel_config.tensor_parallel_size super().__init__(fd_config=fd_config, prefix=prefix, @@ -409,7 +409,7 @@ def __init__(self, fd_config, prefix, with_bias=False, add_bias=True): self.kv_num_heads = fd_config.model_config.num_key_value_heads self.hidden_size = fd_config.model_config.hidden_size self.head_dim = fd_config.model_config.head_dim - self.nranks = fd_config.parallel_config.tensor_parallel_degree + self.nranks = fd_config.parallel_config.tensor_parallel_size self.num_heads_per_rank = divide(self.num_heads, self.nranks) self.kv_num_heads_per_rank = divide(self.kv_num_heads, self.nranks) input_size = self.hidden_size @@ -532,7 +532,7 @@ def __init__( skip_quant=skip_quant) self.fd_config = fd_config self.skip_quant = False - self.nranks = fd_config.parallel_config.tensor_parallel_degree + self.nranks = fd_config.parallel_config.tensor_parallel_size self.hidden_size = fd_config.model_config.hidden_size self.head_dim = fd_config.model_config.head_dim self.num_heads = fd_config.model_config.num_attention_heads // self.nranks @@ -625,7 +625,7 @@ def __init__( with_bias (bool): Whether to include bias or not. Defaults to False. skip_quant (bool): Whether to skip quantization. Defaults to False. 
""" - self.nranks = fd_config.parallel_config.tensor_parallel_degree + self.nranks = fd_config.parallel_config.tensor_parallel_size self.kv_lora_rank = kv_lora_rank self.num_attention_heads = num_attention_heads self.qk_nope_head_dim = qk_nope_head_dim diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py b/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py index 3da7b783e4..d06b14e1bd 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py @@ -49,7 +49,7 @@ def init_ep(self, layer: nn.Layer) -> None: from .ep import EPDecoderRunner self.ep_decoder_runner = EPDecoderRunner( layer.top_k, layer.hidden_size, layer.num_experts, - layer.moe_config.num_max_dispatch_tokens_per_rank, + layer.model_config.num_max_dispatch_tokens_per_rank, layer.ep_size, layer.ep_rank) else: from .ep import EPPrefillRunner diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py index c3bb8d3f1d..709e04f3c3 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py @@ -14,7 +14,6 @@ # limitations under the License. """ -import numpy as np import paddle from paddle import nn from paddleformers.utils.log import logger @@ -23,8 +22,8 @@ import fastdeploy.model_executor.ops.gpu.deep_gemm as deep_gemm from fastdeploy.distributed.communication_op import \ tensor_model_parallel_all_reduce -from fastdeploy.model_executor.ops.gpu import count_tokens_per_expert_func from fastdeploy.model_executor.layers.utils import get_tensor +from fastdeploy.model_executor.ops.gpu import count_tokens_per_expert_func from ..utils import create_and_set_parameter from .fused_moe_backend_base import MoEMethodBase @@ -237,7 +236,7 @@ def apply_ep_decode( [ layer.num_local_experts, layer.ep_size * - layer.moe_config.num_max_dispatch_tokens_per_rank, + layer.model_config.num_max_dispatch_tokens_per_rank, layer.moe_intermediate_size * 2, ], dtype=paddle.bfloat16, @@ -247,7 +246,7 @@ def apply_ep_decode( [ layer.num_local_experts, layer.ep_size * - layer.moe_config.num_max_dispatch_tokens_per_rank, + layer.model_config.num_max_dispatch_tokens_per_rank, layer.hidden_size, ], dtype=paddle.bfloat16, diff --git a/fastdeploy/model_executor/layers/moe/moe.py b/fastdeploy/model_executor/layers/moe/moe.py index a14b4e2cca..e1d1ea73f2 100644 --- a/fastdeploy/model_executor/layers/moe/moe.py +++ b/fastdeploy/model_executor/layers/moe/moe.py @@ -56,8 +56,8 @@ def __init__( self.layer_idx = layer_idx self.reduce_results = reduce_results - self.tp_size = fd_config.parallel_config.tensor_parallel_degree - self.ep_size = fd_config.parallel_config.expert_parallel_degree + self.tp_size = fd_config.parallel_config.tensor_parallel_size + self.ep_size = fd_config.parallel_config.expert_parallel_size self.ep_rank = fd_config.parallel_config.expert_parallel_rank assert (self.tp_size >= 1 and self.ep_size == 1) or \ @@ -65,7 +65,6 @@ def __init__( 'MoE only support parallelism on TP or EP dimension.' 
self.hidden_size = fd_config.model_config.hidden_size - self.moe_config = fd_config.moe_config self.num_experts = num_experts self.num_local_experts = self.num_experts // self.ep_size @@ -125,7 +124,7 @@ def init_moe_weights(self): shape=gate_weight_shape, dtype="float32", ) - if self.moe_config.moe_use_aux_free: + if self.model_config.moe_use_aux_free: self.gate_correction_bias = self.create_parameter( shape=gate_correction_bias_shape, dtype="float32", diff --git a/fastdeploy/model_executor/load_weight_utils.py b/fastdeploy/model_executor/load_weight_utils.py index c8ba1f673b..36b09cf30b 100644 --- a/fastdeploy/model_executor/load_weight_utils.py +++ b/fastdeploy/model_executor/load_weight_utils.py @@ -260,7 +260,7 @@ def load_composite_checkpoint( and os.path.isdir(os.path.join(model_path, f)) ] if len(rank_dirs) > 1: - if fd_config.parallel_config.tensor_parallel_degree != len( + if fd_config.parallel_config.tensor_parallel_size != len( rank_dirs): raise ValueError( f"Your model only supports loading with tp{len(rank_dirs)}" diff --git a/fastdeploy/model_executor/models/deepseek_v3.py b/fastdeploy/model_executor/models/deepseek_v3.py index 73997c2acd..9844eeb6b1 100644 --- a/fastdeploy/model_executor/models/deepseek_v3.py +++ b/fastdeploy/model_executor/models/deepseek_v3.py @@ -109,7 +109,7 @@ def __init__(self, fd_config: FDConfig, layer_id: int, prefix: str) -> None: super().__init__() - self.tp_size = fd_config.parallel_config.tensor_parallel_degree + self.tp_size = fd_config.parallel_config.tensor_parallel_size weight_key_map = { "gate_weight_key": f"{prefix}.gate.weight", @@ -124,23 +124,23 @@ def __init__(self, fd_config: FDConfig, layer_id: int, self.fused_moe = FusedMoE( fd_config=fd_config, reduce_results=False, - moe_intermediate_size=fd_config.model_config.deepseekv3. + moe_intermediate_size=fd_config.model_config. moe_intermediate_size, - num_experts=fd_config.model_config.deepseekv3.n_routed_experts, - top_k=fd_config.model_config.deepseekv3.num_experts_per_tok, - topk_method=fd_config.model_config.deepseekv3.topk_method, - topk_group=fd_config.model_config.deepseekv3.topk_group, - n_group=fd_config.model_config.deepseekv3.n_group, - routed_scaling_factor=fd_config.model_config.deepseekv3. + num_experts=fd_config.model_config.n_routed_experts, + top_k=fd_config.model_config.num_experts_per_tok, + topk_method=fd_config.model_config.topk_method, + topk_group=fd_config.model_config.topk_group, + n_group=fd_config.model_config.n_group, + routed_scaling_factor=fd_config.model_config. 
routed_scaling_factor, layer_idx=layer_id, weight_key_map=weight_key_map, ) - self.num_shared_experts = fd_config.model_config.deepseekv3.n_shared_experts + self.num_shared_experts = fd_config.model_config.n_shared_experts shared_experts_intermediate_size = ( self.num_shared_experts * - fd_config.model_config.deepseekv3.moe_intermediate_size) + fd_config.model_config.moe_intermediate_size) self.shared_experts = DeepSeekV3MLP( fd_config=fd_config, @@ -178,18 +178,18 @@ def __init__(self, prefix: str = "") -> None: super().__init__() - self.tp_size = fd_config.parallel_config.tensor_parallel_degree + self.tp_size = fd_config.parallel_config.tensor_parallel_size self.hidden_size = fd_config.model_config.hidden_size self.num_attention_heads = fd_config.model_config.num_attention_heads self.num_attention_heads_tp = self.num_attention_heads // self.tp_size # MLA - self.qk_nope_head_dim = fd_config.model_config.deepseekv3.qk_nope_head_dim - self.qk_rope_head_dim = fd_config.model_config.deepseekv3.qk_rope_head_dim + self.qk_nope_head_dim = fd_config.model_config.qk_nope_head_dim + self.qk_rope_head_dim = fd_config.model_config.qk_rope_head_dim self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim - self.v_head_dim = fd_config.model_config.deepseekv3.v_head_dim - self.q_lora_rank = fd_config.model_config.deepseekv3.q_lora_rank - self.kv_lora_rank = fd_config.model_config.deepseekv3.kv_lora_rank + self.v_head_dim = fd_config.model_config.v_head_dim + self.q_lora_rank = fd_config.model_config.q_lora_rank + self.kv_lora_rank = fd_config.model_config.kv_lora_rank self.attn_softmax_scale = self.qk_head_dim**-0.5 self.rope_theta = fd_config.model_config.rope_theta @@ -255,7 +255,7 @@ def __init__(self, qk_nope_head_dim=self.qk_nope_head_dim, v_head_dim=self.v_head_dim) - self.rope_scaling = fd_config.model_config.deepseekv3.rope_scaling + self.rope_scaling = fd_config.model_config.rope_scaling if self.rope_scaling: mscale_all_dim = self.rope_scaling.get("mscale_all_dim", False) scaling_factor = self.rope_scaling["factor"] @@ -453,9 +453,9 @@ def __init__( prefix=f"{prefix}.self_attn", ) - if (fd_config.model_config.deepseekv3.n_routed_experts is not None + if (fd_config.model_config.n_routed_experts is not None and layer_id - >= fd_config.model_config.deepseekv3.first_k_dense_replace): + >= fd_config.model_config.first_k_dense_replace): self.mlp = DeepSeekV3MoE( fd_config=fd_config, layer_id=layer_id, @@ -529,7 +529,7 @@ def __init__( Initializer for the DeepSeekV3Model class. 
""" super().__init__() - self.num_layers = fd_config.model_config.num_layers + self.num_layers = fd_config.model_config.num_hidden_layers fd_config.model_config.prefix_name = "deepseek_v3" self.embeddings = VocabParallelEmbedding( @@ -620,7 +620,7 @@ def __init__(self, fd_config: FDConfig): """ super().__init__(fd_config) self.model = DeepSeekV3Model(fd_config) - self.ori_vocab_size = fd_config.model_config.ori_vocab_size + self.ori_vocab_size = fd_config.model_config.vocab_size self.lm_head = ParallelLMHead( fd_config, embedding_dim=fd_config.model_config.hidden_size, diff --git a/fastdeploy/model_executor/models/ernie4_5_moe.py b/fastdeploy/model_executor/models/ernie4_5_moe.py index f6b73622a9..f6cf1cfa1d 100644 --- a/fastdeploy/model_executor/models/ernie4_5_moe.py +++ b/fastdeploy/model_executor/models/ernie4_5_moe.py @@ -54,7 +54,7 @@ def __init__( reduce_results: bool = True, ) -> None: super().__init__() - self.nranks = fd_config.parallel_config.tensor_parallel_degree + self.nranks = fd_config.parallel_config.tensor_parallel_size self.gate_up_proj = MergedColumnParallelLinear( fd_config=fd_config, prefix=f"{prefix}.up_gate_proj", @@ -179,16 +179,16 @@ def __init__(self, fd_config: FDConfig, layer_id: int, self.fused_moe = FusedMoE( fd_config=fd_config, - moe_intermediate_size=fd_config.moe_config.moe_intermediate_size, - num_experts=fd_config.moe_config.num_experts, - top_k=fd_config.moe_config.top_k, + moe_intermediate_size=fd_config.model_config.moe_intermediate_size, + num_experts=fd_config.model_config.moe_num_experts, + top_k=fd_config.model_config.moe_k, layer_idx=layer_id, weight_key_map=weight_key_map, ) - self.num_shared_experts = fd_config.moe_config.moe_num_shared_experts + self.num_shared_experts = fd_config.model_config.moe_num_shared_experts if self.num_shared_experts > 0: - shared_experts_hidden_dim = self.num_shared_experts * fd_config.moe_config.moe_intermediate_size + shared_experts_hidden_dim = self.num_shared_experts * fd_config.model_config.moe_intermediate_size self.shared_experts = Ernie4_5_MLP( fd_config=fd_config, intermediate_size=shared_experts_hidden_dim, @@ -271,8 +271,8 @@ def __init__( prefix=f"{prefix}.self_attn", ) - if (fd_config.moe_config.num_experts is not None - and layer_id >= fd_config.moe_config.moe_layer_start_index): + if (fd_config.model_config.moe_num_experts is not None + and layer_id >= fd_config.model_config.moe_layer_start_index): self.mlp = Ernie4_5_MoE( fd_config=fd_config, layer_id=layer_id, @@ -281,7 +281,7 @@ def __init__( else: self.mlp = Ernie4_5_MLP( fd_config=fd_config, - intermediate_size=fd_config.model_config.ffn_hidden_size, + intermediate_size=fd_config.model_config.intermediate_size, prefix=f"{prefix}.mlp", ) @@ -346,7 +346,7 @@ def __init__( """ super().__init__() - self.num_layers = fd_config.model_config.num_layers + self.num_layers = fd_config.model_config.num_hidden_layers fd_config.model_config.prefix_name = "ernie" self.embeddings = VocabParallelEmbedding( @@ -419,7 +419,7 @@ def __init__(self, fd_config: FDConfig): self.fd_config = fd_config self.model = Ernie4_5_Model(fd_config=fd_config) - self.ori_vocab_size = fd_config.model_config.ori_vocab_size + self.ori_vocab_size = fd_config.model_config.vocab_size self.lm_head = ParallelLMHead( fd_config=fd_config, @@ -466,8 +466,8 @@ def empty_input_forward(self): shape=[0, self.fd_config.model_config.hidden_size], dtype=paddle.get_default_dtype(), ) - for i in range(self.fd_config.moe_config.moe_layer_start_index, - self.fd_config.model_config.num_layers): + for i 
in range(self.fd_config.model_config.moe_layer_start_index, + self.fd_config.model_config.num_hidden_layers): self.model.hidden_layers[i].mlp.fused_moe(fake_hidden_states) def forward( diff --git a/fastdeploy/model_executor/models/ernie4_5_mtp.py b/fastdeploy/model_executor/models/ernie4_5_mtp.py index 029becc1e4..5e032cdd5c 100644 --- a/fastdeploy/model_executor/models/ernie4_5_mtp.py +++ b/fastdeploy/model_executor/models/ernie4_5_mtp.py @@ -262,7 +262,7 @@ def __init__( """ super().__init__() - self.num_layers = fd_config.model_config.num_layers + self.num_layers = fd_config.model_config.num_hidden_layers self.embeddings = fd_config.speculative_config.sharing_model.model.embeddings self.hidden_layers = nn.LayerList([ @@ -351,7 +351,7 @@ def __init__(self, fd_config: FDConfig): self.fd_config = fd_config self.model = Ernie4_5_MTPModel(fd_config=fd_config) - self.ori_vocab_size = fd_config.model_config.ori_vocab_size + self.ori_vocab_size = fd_config.model_config.vocab_size self.lm_head = fd_config.speculative_config.sharing_model.lm_head self.tie_word_embeddings = fd_config.model_config.tie_word_embeddings @@ -398,8 +398,8 @@ def empty_input_forward(self): shape=[0, self.fd_config.model_config.hidden_size], dtype=paddle.get_default_dtype(), ) - for i in range(self.fd_config.moe_config.moe_layer_start_index, - self.fd_config.model_config.num_layers): + for i in range(self.fd_config.model_config.moe_layer_start_index, + self.fd_config.model_config.num_hidden_layers): self.model.hidden_layers[i].mlp.fused_moe(fake_hidden_states) def forward( diff --git a/fastdeploy/model_executor/models/ernie4_5_vl/configuration.py b/fastdeploy/model_executor/models/ernie4_5_vl/configuration.py index f25742d3c2..c7812e258b 100644 --- a/fastdeploy/model_executor/models/ernie4_5_vl/configuration.py +++ b/fastdeploy/model_executor/models/ernie4_5_vl/configuration.py @@ -16,7 +16,7 @@ import copy -from fastdeploy.config import ModelConfig +from paddleformers.transformers.configuration_utils import PretrainedConfig from .dfnrope.modeling import DFNRopeVisionTransformerConfig @@ -25,7 +25,7 @@ ] -class Ernie4_5_VLMoeConfig(ModelConfig): +class Ernie4_5_VLMoeConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`~ErnieModel`]. It is used to instantiate an Ernie model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the diff --git a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py index a08433a570..68d059906c 100644 --- a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py +++ b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py @@ -68,8 +68,8 @@ def __init__(self, fd_config: FDConfig, layer_id: int, prefix: str) -> None: super().__init__() - self.tp_size = fd_config.parallel_config.tensor_parallel_degree - moe_layer_start_index = fd_config.moe_config.moe_layer_start_index + self.tp_size = fd_config.parallel_config.tensor_parallel_size + moe_layer_start_index = fd_config.model_config.moe_layer_start_index if isinstance(moe_layer_start_index, int): text_moe_layer_start_index = moe_layer_start_index image_moe_layer_start_index = moe_layer_start_index @@ -77,10 +77,10 @@ def __init__(self, fd_config: FDConfig, layer_id: int, text_moe_layer_start_index = moe_layer_start_index[0] image_moe_layer_start_index = moe_layer_start_index[1] - moe_layer_end_index = fd_config.moe_config.moe_layer_end_index + moe_layer_end_index = fd_config.model_config.moe_layer_end_index if moe_layer_end_index is None: - text_moe_layer_end_index = fd_config.model_config.num_layers - image_moe_layer_end_index = fd_config.model_config.num_layers + text_moe_layer_end_index = fd_config.model_config.num_hidden_layers + image_moe_layer_end_index = fd_config.model_config.num_hidden_layers elif isinstance(moe_layer_end_index, int): text_moe_layer_end_index = moe_layer_end_index image_moe_layer_end_index = moe_layer_end_index @@ -103,11 +103,11 @@ def __init__(self, fd_config: FDConfig, layer_id: int, self.mlp_text = FusedMoE( fd_config=fd_config, reduce_results=False, - moe_intermediate_size=fd_config.moe_config. + moe_intermediate_size=fd_config.model_config. moe_intermediate_size[0], - num_experts=fd_config.moe_config.num_experts[0], + num_experts=fd_config.model_config.moe_num_experts[0], expert_id_offset=0, - top_k=fd_config.moe_config.top_k, + top_k=fd_config.model_config.moe_k, layer_idx=layer_id, moe_tag="Text", weight_key_map=weight_key_map, @@ -116,7 +116,7 @@ def __init__(self, fd_config: FDConfig, layer_id: int, else: self.mlp_text = Ernie4_5_VLMLP( fd_config=fd_config, - intermediate_size=fd_config.model_config.ffn_hidden_size, + intermediate_size=fd_config.model_config.intermediate_size, prefix=f"{prefix}", ) @@ -135,11 +135,11 @@ def __init__(self, fd_config: FDConfig, layer_id: int, self.mlp_image = FusedMoE( fd_config=fd_config, reduce_results=False, - moe_intermediate_size=fd_config.moe_config. + moe_intermediate_size=fd_config.model_config. 
moe_intermediate_size[1], - num_experts=fd_config.moe_config.num_experts[1], - expert_id_offset=fd_config.moe_config.num_experts[0], - top_k=fd_config.moe_config.top_k, + num_experts=fd_config.model_config.moe_num_experts[1], + expert_id_offset=fd_config.model_config.moe_num_experts[0], + top_k=fd_config.model_config.moe_k, layer_idx=layer_id, moe_tag="Image", weight_key_map=weight_key_map, @@ -148,16 +148,16 @@ def __init__(self, fd_config: FDConfig, layer_id: int, else: self.mlp_image = Ernie4_5_VLMLP( fd_config=fd_config, - intermediate_size=fd_config.model_config.ffn_hidden_size, + intermediate_size=fd_config.model_config.intermediate_size, prefix=f"{prefix}", ) - self.num_shared_experts = fd_config.moe_config.moe_num_shared_experts + self.num_shared_experts = fd_config.model_config.moe_num_shared_experts if self.num_shared_experts > 0: self.share_experts = Ernie4_5_VLMLP( fd_config=fd_config, intermediate_size=self.num_shared_experts * - fd_config.moe_config.moe_intermediate_size[0], + fd_config.model_config.moe_intermediate_size[0], prefix=f"{prefix}.shared_experts", reduce_results=False, ) @@ -231,15 +231,15 @@ def __init__( super().__init__() layer_id = int(prefix.split(sep='.')[-1]) - moe_layer_start_index = fd_config.moe_config.moe_layer_start_index + moe_layer_start_index = fd_config.model_config.moe_layer_start_index if isinstance(moe_layer_start_index, list): min_moe_layer_start_index = min(moe_layer_start_index) else: min_moe_layer_start_index = moe_layer_start_index - max_moe_layer_end_index = fd_config.model_config.num_layers - if fd_config.moe_config.moe_layer_end_index is not None: - moe_layer_end_index = fd_config.moe_config.moe_layer_end_index + max_moe_layer_end_index = fd_config.model_config.num_hidden_layers + if fd_config.model_config.moe_layer_end_index is not None: + moe_layer_end_index = fd_config.model_config.moe_layer_end_index if isinstance(moe_layer_start_index, list): max_moe_layer_end_index = max(moe_layer_end_index) else: @@ -253,7 +253,7 @@ def __init__( assert min_moe_layer_start_index <= max_moe_layer_end_index - if (fd_config.moe_config.num_experts is not None + if (fd_config.model_config.moe_num_experts is not None and layer_id >= min_moe_layer_start_index and layer_id <= max_moe_layer_end_index): self.mlp = Ernie4_5_VLMoE( @@ -264,7 +264,7 @@ def __init__( else: self.mlp = Ernie4_5_VLMLP( fd_config=fd_config, - intermediate_size=fd_config.model_config.ffn_hidden_size, + intermediate_size=fd_config.model_config.intermediate_size, prefix=f"{prefix}.mlp", ) @@ -332,8 +332,8 @@ def __init__( """ super().__init__() - self.num_layers = fd_config.model_config.num_layers - self.im_patch_id = fd_config.moe_config.im_patch_id + self.num_layers = fd_config.model_config.num_hidden_layers + self.im_patch_id = fd_config.model_config.im_patch_id self._dtype = fd_config.model_config.dtype fd_config.model_config.prefix_name = "ernie" @@ -468,7 +468,7 @@ def __init__(self, fd_config: FDConfig): self.model = Ernie4_5_VLModel(fd_config=fd_config) - self.ori_vocab_size = fd_config.model_config.ori_vocab_size + self.ori_vocab_size = fd_config.model_config.vocab_size self.lm_head = ParallelLMHead( fd_config=fd_config, diff --git a/fastdeploy/model_executor/models/model_base.py b/fastdeploy/model_executor/models/model_base.py index 2e252fd0ec..a9da513fb4 100644 --- a/fastdeploy/model_executor/models/model_base.py +++ b/fastdeploy/model_executor/models/model_base.py @@ -53,7 +53,7 @@ def __init__(self, configs): """ Args: configs (dict): Configurations including parameters 
such as max_dec_len, min_dec_len, decode_strategy, - ori_vocab_size, use_topp_sampling, etc. + vocab_size, use_topp_sampling, etc. """ super(ModelForCasualLM, self).__init__() diff --git a/fastdeploy/model_executor/models/qwen2.py b/fastdeploy/model_executor/models/qwen2.py index 0a5912afb0..760f39e0a3 100644 --- a/fastdeploy/model_executor/models/qwen2.py +++ b/fastdeploy/model_executor/models/qwen2.py @@ -47,12 +47,12 @@ def __init__( prefix: str = "", ) -> None: super().__init__() - self.nranks = fd_config.parallel_config.tensor_parallel_degree + self.nranks = fd_config.parallel_config.tensor_parallel_size self.gate_up_proj = MergedColumnParallelLinear( fd_config=fd_config, prefix=f"{prefix}.up_gate_proj", input_size=fd_config.model_config.hidden_size, - output_size=fd_config.model_config.ffn_hidden_size * 2, + output_size=fd_config.model_config.intermediate_size * 2, with_bias=False, activation=fd_config.model_config.hidden_act, ) @@ -60,7 +60,7 @@ def __init__( self.down_proj = RowParallelLinear( fd_config=fd_config, prefix=f"{prefix}.down_proj", - input_size=fd_config.model_config.ffn_hidden_size, + input_size=fd_config.model_config.intermediate_size, output_size=fd_config.model_config.hidden_size, with_bias=False, ) @@ -227,7 +227,7 @@ def __init__( """ super().__init__() - self.num_layers = fd_config.model_config.num_layers + self.num_layers = fd_config.model_config.num_hidden_layers fd_config.model_config.prefix_name = "qwen2" self.embeddings = VocabParallelEmbedding( @@ -304,7 +304,7 @@ def __init__(self, fd_config: FDConfig): self.model = Qwen2Model(fd_config=fd_config) - self.ori_vocab_size = fd_config.model_config.ori_vocab_size + self.ori_vocab_size = fd_config.model_config.vocab_size self.lm_head = ParallelLMHead( fd_config=fd_config, diff --git a/fastdeploy/model_executor/models/qwen3.py b/fastdeploy/model_executor/models/qwen3.py index c1654f4144..78eeb46632 100644 --- a/fastdeploy/model_executor/models/qwen3.py +++ b/fastdeploy/model_executor/models/qwen3.py @@ -56,7 +56,7 @@ def __init__(self, self.fd_config = fd_config self.head_dim = fd_config.model_config.head_dim - nranks = fd_config.parallel_config.tensor_parallel_degree + nranks = fd_config.parallel_config.tensor_parallel_size self.q_size = fd_config.model_config.num_attention_heads * self.head_dim // nranks self.kv_size = fd_config.model_config.num_key_value_heads * self.head_dim // nranks @@ -162,7 +162,7 @@ def __init__( """ super().__init__() - self.num_layers = fd_config.model_config.num_layers + self.num_layers = fd_config.model_config.num_hidden_layers fd_config.model_config.prefix_name = "model" fd_config.model_config.tie_word_embeddings = True @@ -239,7 +239,7 @@ def __init__(self, fd_config: FDConfig): self.model = Qwen3Model(fd_config=fd_config) - self.ori_vocab_size = fd_config.model_config.ori_vocab_size + self.ori_vocab_size = fd_config.model_config.vocab_size self.lm_head = ParallelLMHead( fd_config=fd_config, diff --git a/fastdeploy/model_executor/models/qwen3moe.py b/fastdeploy/model_executor/models/qwen3moe.py index c4d01ef6ea..705ee81d7f 100644 --- a/fastdeploy/model_executor/models/qwen3moe.py +++ b/fastdeploy/model_executor/models/qwen3moe.py @@ -48,13 +48,13 @@ def __init__( prefix: str = "", ) -> None: super().__init__() - self.nranks = fd_config.parallel_config.tensor_parallel_degree + self.nranks = fd_config.parallel_config.tensor_parallel_size self.gate_up_proj = MergedColumnParallelLinear( fd_config, prefix=f"{prefix}.up_gate_proj", input_size=fd_config.model_config.hidden_size, - 
output_size=fd_config.model_config.ffn_hidden_size * 2, + output_size=fd_config.model_config.intermediate_size * 2, with_bias=False, activation=fd_config.model_config.hidden_act, ) @@ -62,7 +62,7 @@ def __init__( self.down_proj = RowParallelLinear( fd_config, prefix=f"{prefix}.down_proj", - input_size=fd_config.model_config.ffn_hidden_size, + input_size=fd_config.model_config.intermediate_size, output_size=fd_config.model_config.hidden_size, with_bias=False, ) @@ -104,7 +104,7 @@ def __init__(self, self.qkv_proj = QKVParallelLinear(fd_config, prefix=f"{prefix}.qkv_proj", with_bias=False) - nranks = fd_config.parallel_config.tensor_parallel_degree + nranks = fd_config.parallel_config.tensor_parallel_size self.o_proj = RowParallelLinear( fd_config, @@ -199,14 +199,14 @@ def __init__( f"{prefix}.mlp.experts.{{}}.down_proj.weight", } - if (fd_config.moe_config.num_experts is not None - and layer_id >= fd_config.moe_config.moe_layer_start_index): + if (fd_config.model_config.moe_num_experts is not None + and layer_id >= fd_config.model_config.moe_layer_start_index): self.mlp = FusedMoE(fd_config, - moe_intermediate_size=fd_config.moe_config. + moe_intermediate_size=fd_config.model_config. moe_intermediate_size, - num_experts=fd_config.moe_config.num_experts, - top_k=fd_config.moe_config.top_k, + num_experts=fd_config.model_config.moe_num_experts, + top_k=fd_config.model_config.moe_topk, layer_idx=layer_id, weight_key_map=weight_key_map) else: @@ -283,7 +283,7 @@ def __init__( """ super().__init__() - self.num_layers = fd_config.model_config.num_layers + self.num_layers = fd_config.model_config.num_hidden_layers fd_config.model_config.prefix_name = "model" self.embeddings = VocabParallelEmbedding( @@ -358,7 +358,7 @@ def __init__(self, fd_config: FDConfig): self.model = Qwen3MoeModel(fd_config) - self.ori_vocab_size = fd_config.model_config.ori_vocab_size + self.ori_vocab_size = fd_config.model_config.vocab_size self.lm_head = ParallelLMHead( fd_config, diff --git a/fastdeploy/model_executor/models/tp_utils.py b/fastdeploy/model_executor/models/tp_utils.py index f360c5106f..2b35050ab5 100644 --- a/fastdeploy/model_executor/models/tp_utils.py +++ b/fastdeploy/model_executor/models/tp_utils.py @@ -36,7 +36,7 @@ def check_tensor_parallel_prerequisites( safetensor_keys: List[str], ) -> None: """check_tensor_parallel_prerequisites""" - if fd_config.parallel_config.tensor_parallel_degree > 1: + if fd_config.parallel_config.tensor_parallel_size > 1: tensor_parallel_map = cls._get_tensor_parallel_mappings( fd_config.model_config, is_split=True) if not tensor_parallel_map: diff --git a/fastdeploy/rl/rollout_model.py b/fastdeploy/rl/rollout_model.py index 53e4532746..8f0216b7ff 100644 --- a/fastdeploy/rl/rollout_model.py +++ b/fastdeploy/rl/rollout_model.py @@ -171,7 +171,7 @@ def _add_layer_mappings(layer_idx, is_moe_layer=False): infer_to_train[f"{infer_base_name}.{layer_idx}.mlp.fused_moe.gate_weight"] = \ f"ernie.layers.{layer_idx}.mlp.gate.weight" - if self.fd_config.moe_config.moe_use_aux_free: + if self.fd_config.model_config.moe_use_aux_free: infer_to_train[f"{infer_base_name}.{layer_idx}.mlp.fused_moe.gate_correction_bias"] = \ f"ernie.layers.{layer_idx}.mlp.moe_statics.e_score_correction_bias" @@ -184,7 +184,7 @@ def _add_layer_mappings(layer_idx, is_moe_layer=False): f"ernie.layers.{layer_idx}.mlp.shared_experts.down_proj.weight" # MoE experts mappings - for expert_idx in range(self.fd_config.moe_config.num_experts): + for expert_idx in range(self.fd_config.model_config.moe_num_experts): for ph 
in place_holders: # FFN1 (up_gate_proj) ffn1_key = f"{infer_base_name}.{layer_idx}.mlp.fused_moe.moe_ffn1_weight" @@ -204,12 +204,12 @@ def _add_layer_mappings(layer_idx, is_moe_layer=False): # Process non-MoE layers for layer_idx in range( - self.fd_config.moe_config.moe_layer_start_index): + self.fd_config.model_config.moe_layer_start_index): _add_layer_mappings(layer_idx, is_moe_layer=False) # Process MoE layers - for layer_idx in range(self.fd_config.moe_config.moe_layer_start_index, - self.fd_config.model_config.num_layers): + for layer_idx in range(self.fd_config.model_config.moe_layer_start_index, + self.fd_config.model_config.num_hidden_layers): _add_layer_mappings(layer_idx, is_moe_layer=True) return infer_to_train diff --git a/fastdeploy/spec_decode/mtp.py b/fastdeploy/spec_decode/mtp.py index 97e8364451..6b8a0a363a 100644 --- a/fastdeploy/spec_decode/mtp.py +++ b/fastdeploy/spec_decode/mtp.py @@ -49,7 +49,7 @@ class MTPProposer(Proposer): def __init__(self, cfg, main_model, local_rank, device_id, main_model_inputs): super().__init__(cfg) - self.num_main_model_layers = self.model_config.num_layers + self.num_main_model_layers = self.model_config.num_hidden_layers self.local_rank = local_rank self.device_id = device_id self._update_cfg(main_model) @@ -70,7 +70,7 @@ def _update_cfg(self, main_model): """ self.model_config.architectures[0] = "Ernie4_5_MTPForCausalLM" self.speculative_config.sharing_model = main_model - self.model_config.num_layers = 1 + self.model_config.num_hidden_layers = 1 self.parallel_config.model_name_or_path = ( self.speculative_config.model_name_or_path) self.model_config.prefix_name = "ernie.mtp_block" @@ -145,7 +145,7 @@ def initialize_kv_cache(self): cache_kvs_list = [] for i in range( self.num_main_model_layers, - self.num_main_model_layers + self.model_config.num_layers): + self.num_main_model_layers + self.model_config.num_hidden_layers): key_cache = paddle.empty(shape=[], dtype=cache_type) key_cache_name = f"key_caches_{i}_rank{self.local_rank}.device{self.device_id}" val_cache_name = f"value_caches_{i}_rank{self.local_rank}.device{self.device_id}" @@ -159,7 +159,7 @@ def initialize_kv_cache(self): self.model_inputs["caches"] = cache_kvs_list else: - for i in range(self.model_config.num_layers): + for i in range(self.model_config.num_hidden_layers): self.cache_kvs["key_caches_{}".format(i)] = paddle.full( shape=kv_cache_shape, fill_value=0, @@ -183,10 +183,10 @@ def _initialize_attn_backend(self, ) -> None: # TODO(gongshaotian): Get rank from config num_heads = (self.model_config.num_attention_heads // - self.parallel_config.tensor_parallel_degree) + self.parallel_config.tensor_parallel_size) self.model_config.kv_num_heads = ( int(self.model_config.num_key_value_heads) // - self.parallel_config.tensor_parallel_degree) + self.parallel_config.tensor_parallel_size) head_dim = self.model_config.head_dim # Get the attention backend @@ -592,7 +592,7 @@ def _propose(self, target_hidden_states): self.model_inputs, ) - if self.parallel_config.tensor_parallel_degree > 1: + if self.parallel_config.tensor_parallel_size > 1: paddle.distributed.broadcast(sampled_token_ids, 0) self._post_process(sampled_token_ids) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 8d6ca79a1b..37250cea39 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -257,7 +257,7 @@ def insert_prefill_inputs(self, req_dicts: List[Request]): self.share_inputs["min_dec_len"][idx:idx + 1] = request.get( 
"min_tokens", 1) self.share_inputs["max_dec_len"][idx:idx + 1] = request.get( - "max_tokens", self.model_config.max_length) + "max_tokens", self.model_config.max_model_len) self.share_inputs["stop_flags"][idx:idx + 1] = False self.share_inputs["first_token_ids"][ @@ -370,11 +370,11 @@ def _init_share_inputs(self, max_num_seqs: int): self.share_inputs["min_dec_len"] = paddle.full( [max_num_seqs, 1], self.model_config.min_length, dtype='int64') self.share_inputs["max_dec_len"] = paddle.full( - [max_num_seqs, 1], self.model_config.max_length, dtype='int64') + [max_num_seqs, 1], self.model_config.max_model_len, dtype='int64') self.share_inputs["min_length"] = paddle.full( [max_num_seqs, 1], self.model_config.min_length, dtype='int64') self.share_inputs["max_length"] = paddle.full( - [max_num_seqs, 1], self.model_config.max_length, dtype='int64') + [max_num_seqs, 1], self.model_config.max_model_len, dtype='int64') self.share_inputs["seq_lens_this_time"] = paddle.full(max_num_seqs, 0, dtype='int32') @@ -661,13 +661,13 @@ def initialize_kv_cache(self) -> None: # Get kv cache shape kv_cache_shape = self.attn_backends[0].get_kv_cache_shape( max_num_blocks=max_block_num) - local_rank = self.local_rank % self.parallel_config.tensor_parallel_degree + local_rank = self.local_rank % self.parallel_config.tensor_parallel_size if not self.parallel_config.do_profile and ( self.parallel_config.enable_prefix_caching \ or self.parallel_config.splitwise_role != "mixed"): cache_kvs_list = [] - for i in range(self.model_config.num_layers): + for i in range(self.model_config.num_hidden_layers): key_cache = paddle.empty(shape=[], dtype=cache_type) key_cache_name = f"key_caches_{i}_rank{local_rank}.device{self.device_id}" val_cache_name = f"value_caches_{i}_rank{local_rank}.device{self.device_id}" @@ -682,7 +682,7 @@ def initialize_kv_cache(self) -> None: self.share_inputs["caches"] = cache_kvs_list else: - for i in range(self.model_config.num_layers): + for i in range(self.model_config.num_hidden_layers): cache_kvs["key_caches_{}".format(i)] = paddle.full( shape=kv_cache_shape, @@ -706,10 +706,10 @@ def initialize_attn_backend(self) -> None: assert len(self.attn_backends) == 0 # TODO(gongshaotian): Get rank from config - num_heads = self.model_config.num_attention_heads // self.parallel_config.tensor_parallel_degree + num_heads = self.model_config.num_attention_heads // self.parallel_config.tensor_parallel_size self.model_config.kv_num_heads = int( self.model_config.num_key_value_heads - ) // self.parallel_config.tensor_parallel_degree + ) // self.parallel_config.tensor_parallel_size head_dim = self.model_config.head_dim # Get the attention backend @@ -788,14 +788,14 @@ def _dummy_run(self, ) sampled_token_ids = self.sampler(logits, self.sampling_metadata) - if self.parallel_config.tensor_parallel_degree > 1: + if self.parallel_config.tensor_parallel_size > 1: paddle.distributed.broadcast(sampled_token_ids, 0) else: self.sampler(logits, self.sampling_metadata, self.parallel_config.max_model_len, self.share_inputs) sampled_token_ids = None - if self.parallel_config.tensor_parallel_degree > 1: + if self.parallel_config.tensor_parallel_size > 1: paddle.distributed.broadcast( self.share_inputs["accept_tokens"], 0) paddle.distributed.broadcast( @@ -1026,14 +1026,14 @@ class at the server level, which is too granular for ModelRunner. 
self.sampling_metadata, skip_idx_list, ) - if self.parallel_config.tensor_parallel_degree > 1: + if self.parallel_config.tensor_parallel_size > 1: paddle.distributed.broadcast(sampled_token_ids, 0) else: self.sampler(logits, self.sampling_metadata, self.parallel_config.max_model_len, self.share_inputs) sampled_token_ids = None - if self.parallel_config.tensor_parallel_degree > 1: + if self.parallel_config.tensor_parallel_size > 1: paddle.distributed.broadcast( self.share_inputs["accept_tokens"], 0) paddle.distributed.broadcast(self.share_inputs["accept_num"], @@ -1211,11 +1211,11 @@ def cal_theortical_kvcache(self): hidden_dim = self.model_config.head_dim * self.model_config.kv_num_heads # NOTE(liuzichang): Implement multi-layer MTP architecture in the future - num_layers = self.model_config.num_layers + \ + num_layers = self.model_config.num_hidden_layers + \ self.speculative_config.num_gpu_block_expand_ratio if \ self.speculative_method in [ "mtp" - ] else self.model_config.num_layers + ] else self.model_config.num_hidden_layers required_memory = ( byte_of_dtype * 2 * # k + v (self.parallel_config.block_size * hidden_dim) * num_layers) diff --git a/fastdeploy/worker/vl_gpu_model_runner.py b/fastdeploy/worker/vl_gpu_model_runner.py index f48cefe8f3..66be1d24fe 100644 --- a/fastdeploy/worker/vl_gpu_model_runner.py +++ b/fastdeploy/worker/vl_gpu_model_runner.py @@ -13,10 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. """ +import argparse import json import os import random -import argparse import numpy as np import paddle @@ -24,6 +24,9 @@ from paddleformers.transformers.model_utils import load_tp_checkpoint from safetensors import safe_open +from fastdeploy.config import (DecodingConfig, DeviceConfig, FDConfig, + LoadConfig, ModelConfig, MoEPhase, + ParallelConfig, SpeculativeConfig) from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer from fastdeploy.input.mm_processor import DataProcessor from fastdeploy.model_executor.layers.attention import get_attention_backend @@ -44,9 +47,6 @@ from fastdeploy.worker.forward_meta import ForwardMeta from fastdeploy.worker.utils import check_safetensors_model from fastdeploy.worker.vl_model_runner_base import VLModelRunnerBase -from fastdeploy.config import (DeviceConfig, FDConfig, KVCacheConfig, - LoadConfig, ModelConfig, MoEConfig, - MoEPhase, ParallelConfig, SpeculativeConfig) if current_platform.is_cuda() and current_platform.available(): from fastdeploy.model_executor.layers.utils import ( @@ -283,10 +283,10 @@ def _load_model( self.fd_config = fd_config attn_backend_cls = get_attention_backend() num_heads = self.fd_config.model_config.num_attention_heads // \ - self.fd_config.parallel_config.tensor_parallel_degree + self.fd_config.parallel_config.tensor_parallel_size self.fd_config.model_config.kv_num_heads = int( self.fd_config.model_config.num_key_value_heads - ) // self.fd_config.parallel_config.tensor_parallel_degree + ) // self.fd_config.parallel_config.tensor_parallel_size head_dim = self.fd_config.model_config.head_dim self.attn_backend = attn_backend_cls( self.fd_config, @@ -847,7 +847,7 @@ def generate(self) -> None: ) # sampler & save_output next_tokens = self.sampler(logits, self.sampling_metadata) - if self.fd_config.parallel_config.tensor_parallel_degree > 1: + if self.fd_config.parallel_config.tensor_parallel_size > 1: paddle.distributed.broadcast(next_tokens, 0) self.post_process(next_tokens) @@ -1056,7 +1056,6 @@ def build_stream_line_model( """ import 
contextlib - from paddleformers.transformers.configuration_utils import PretrainedConfig from paddleformers.trl import llm_utils from paddleformers.utils.log import logger @@ -1064,120 +1063,41 @@ def build_stream_line_model( get_quantization_config from fastdeploy.model_executor.models.model_base import ModelRegistry - config, _ = PretrainedConfig.get_config_dict(model_path) - config["head_dim"] = config.get( - "head_dim", config["hidden_size"] // config["num_attention_heads"]) - config["rope_theta"] = config.get("rope_theta", 10000.0) - rope_theta = config["rope_theta"] - model_config = ModelConfig.from_dict(config) - model_config.head_dim = config["head_dim"] - - parallel_config = ParallelConfig() - speculative_config = SpeculativeConfig() - device_config = DeviceConfig() - load_config = LoadConfig() - moe_config = MoEConfig() - kv_cache_config = KVCacheConfig() - kv_cache_config.cache_quant_dtype = "none" - - tensor_parallel_rank, tensor_parallel_degree = llm_utils.init_dist_env() - parallel_config.tensor_parallel_rank = tensor_parallel_rank - parallel_config.tensor_parallel_degree = tensor_parallel_degree - parallel_config.tensor_parallel_degree = tensor_parallel_degree - parallel_config.expert_parallel_degree = 1 - parallel_config.expert_parallel_rank = int(tensor_parallel_rank / - tensor_parallel_degree) - parallel_config.column_cut = False - - speculative_config.is_mtp = False - speculative_config.draft_type = "None" - - # Note(tangbinhan): used for load_checkpoint - model_config.tensor_parallel_rank = parallel_config.tensor_parallel_rank - model_config.tensor_parallel_degree = parallel_config.tensor_parallel_degree - model_config.is_mtp = speculative_config.is_mtp - moe_config.num_experts = None - - # use the length of tokenizer as the origin vocab size - ori_vocab_size = len(tokenizer) - moe_intermediate_size = (config.get("moe_intermediate_size", None), ) - if isinstance(moe_intermediate_size, list) or isinstance( - moe_intermediate_size, tuple): - moe_intermediate_size = moe_intermediate_size[0] - - num_key_value_heads = config.get("num_key_value_heads", -1) - if num_key_value_heads is None: - num_key_value_heads = -1 + # init args + args = {} + args["max_model_len"] = max_model_len + args["dtype"] = dtype + args["block_size"] = block_size + args["model_name_or_path"] = model_path + args["tensor_parallel_rank"], args["tensor_parallel_size"] = llm_utils.init_dist_env() + args["expert_parallel_size"] = 1 + args["expert_parallel_rank"] = int(args["tensor_parallel_rank"] / + args["tensor_parallel_size"]) + + model_config = ModelConfig(args) + device_config = DeviceConfig(args) + decoding_config = DecodingConfig(args) + speculative_config = SpeculativeConfig(args) + parallel_config = ParallelConfig(args) + load_config = LoadConfig(args) # RL need, some model num_key_value_heads less tensor_parallel_degree, need copy - if num_key_value_heads < tensor_parallel_degree: + if model_config.num_key_value_heads < parallel_config.tensor_parallel_size: logger.warning( - f"key value heads num is {num_key_value_heads}, tensor parallel degree is {tensor_parallel_degree}" + f"key value heads num is {model_config.num_key_value_heads}, tensor parallel degree is {parallel_config.tensor_parallel_size}" ) - num_key_value_heads = tensor_parallel_degree - - if config.get("ffn_hidden_size", None) is not None: - ffn_hidden_size = config["ffn_hidden_size"] - elif config.get("intermediate_size", None) is not None: - ffn_hidden_size = config["intermediate_size"] - else: - ffn_hidden_size = 4 * 
config["hidden_size"] - if config["hidden_act"].lower() == "swiglu": - if paddle.distributed.get_world_size() > 1: - multiple_of = 8 * config["num_attention_heads"] - else: - multiple_of = 4 * config["num_attention_heads"] - ffn_hidden_size = multiple_of * ( - (int(2 * ffn_hidden_size / 3) + multiple_of - 1) // - multiple_of) - - num_layers = config.get("num_layers", None) or config.get( - "num_hidden_layers", None) - if num_layers is None: - raise ValueError(f"num_layers<{num_layers}> is invalid") - - remove_tail_layer = config.get("remove_tail_layer") - if remove_tail_layer is True: - num_layers -= 1 - elif isinstance(remove_tail_layer, int): - num_layers -= remove_tail_layer - - moe_num_experts = config.get("moe_num_experts", 0) - if isinstance(moe_num_experts, list): - moe_num_experts = max(moe_num_experts) - use_moe = moe_num_experts > 0 - - context = contextlib.nullcontext() + num_key_value_heads = parallel_config.tensor_parallel_size + model_config.num_key_value_heads = num_key_value_heads - if config["hidden_act"].lower() == "swiglu": - model_config.hidden_act = "swiglu" - model_config.ffn_hidden_size = ffn_hidden_size - model_config.max_seq_len = max_model_len - model_config.num_layers = num_layers - model_config.dtype = dtype - parallel_config.block_size = block_size - - parallel_config.msg_queue_id = None - model_config.num_key_value_heads = num_key_value_heads - model_config.return_all_hidden_states = False - speculative_config.draft_type = "None" - model_config.start_layer_index = 0 - if use_moe: - moe_config.num_experts = config.get("moe_num_experts", None) - moe_config.moe_intermediate_size = config.get("moe_intermediate_size", - None) - moe_config.top_k = config.get("moe_topk", 8) - moe_config.moe_num_shared_experts = config.get( - "moe_num_shared_experts", 0) - moe_config.moe_layer_start_index = config.get("moe_layer_start_index", - None) - moe_config.moe_layer_end_index = config.get("moe_layer_end_index", - None) + if getattr(model_config, 'num_hidden_layers', None) is None: + raise ValueError("num_hidden_layers is None") model_config.moe_phase = MoEPhase.PREFILL - model_config.ori_vocab_size = ori_vocab_size + # use the length of tokenizer as the origin vocab size + ori_vocab_size = len(tokenizer) + model_config.vocab_size = ori_vocab_size - quantization_config = config.get("quantization_config", None) + quantization_config = model_config.quantization_config quant_config_name = None if quantization_config is not None and quantization_config.get( @@ -1192,7 +1112,7 @@ def build_stream_line_model( quant_config = quant_cls.from_config(quantization_config) elif quantization != "None": quantization_config = {} - if use_moe and quantization == "wint4": + if quantization == "wint4": quantization_config["dense_quant_type"] = "wint8" quantization_config["moe_quant_type"] = "wint4" quant_config_name = "mix_quant" @@ -1218,13 +1138,11 @@ def build_stream_line_model( speculative_config=speculative_config, device_config=device_config, load_config=load_config, - moe_config=moe_config, + decoding_config=decoding_config, quant_config=quant_config, - kv_cache_config=kv_cache_config, ) - fd_config.parallel_config.max_model_len = max_model_len - fd_config.model_config.rope_theta = rope_theta + context = contextlib.nullcontext() with context: model_cls = ModelRegistry.get_class(model_config.architectures[0]) model = model_cls(fd_config) diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py index ba7a5541a5..5151dc6db0 100644 --- 
a/fastdeploy/worker/worker_process.py +++ b/fastdeploy/worker/worker_process.py @@ -22,11 +22,9 @@ import paddle.distributed as dist import paddle.distributed.fleet as fleet -from fastdeploy import envs from fastdeploy.config import (DecodingConfig, DeviceConfig, FDConfig, GraphOptimizationConfig, LoadConfig, - ModelConfig, MoEConfig, MoEPhase, - ParallelConfig, SpeculativeConfig) + ModelConfig, ParallelConfig, SpeculativeConfig) from fastdeploy.inter_communicator import EngineWorkerQueue as TaskQueue from fastdeploy.inter_communicator import IPCSignal from fastdeploy.model_executor.layers.quantization import \ @@ -75,28 +73,24 @@ def __init__( # Initialize distributed enviroment (self.ranks, self.local_rank) = self.init_distributed_enviroment() - assert self.parallel_config.tensor_parallel_degree * self.parallel_config.expert_parallel_degree == self.ranks + assert self.parallel_config.tensor_parallel_size * self.parallel_config.expert_parallel_size == self.ranks self.fd_config.parallel_config.tensor_parallel_rank = \ - self.local_rank % self.parallel_config.tensor_parallel_degree + self.local_rank % self.parallel_config.tensor_parallel_size self.fd_config.parallel_config.expert_parallel_rank = \ - int(self.local_rank / self.parallel_config.tensor_parallel_degree) + int(self.local_rank / self.parallel_config.tensor_parallel_size) if self.fd_config.parallel_config.use_ep: - self.fd_config.moe_config.num_experts_per_rank = \ - self.fd_config.moe_config.num_experts // self.parallel_config.expert_parallel_degree - self.fd_config.moe_config.num_experts_start_offset = \ - self.fd_config.parallel_config.expert_parallel_rank * self.fd_config.moe_config.num_experts_per_rank + self.fd_config.model_config.num_experts_per_rank = \ + self.fd_config.model_config.moe_num_experts // self.parallel_config.expert_parallel_size + self.fd_config.model_config.num_experts_start_offset = \ + self.fd_config.parallel_config.expert_parallel_rank * self.fd_config.model_config.num_experts_per_rank # For auto TP split - self.fd_config.model_config.tensor_parallel_degree = self.parallel_config.tensor_parallel_degree + self.fd_config.model_config.tensor_parallel_degree = self.parallel_config.tensor_parallel_size self.fd_config.model_config.tensor_parallel_rank = self.parallel_config.tensor_parallel_rank self.fd_config.model_config.use_ep = self.parallel_config.use_ep - if self.fd_config.parallel_config.use_ep: - self.fd_config.model_config.num_experts_per_rank = self.fd_config.moe_config.num_experts_per_rank - self.fd_config.model_config.num_experts_start_offset = self.fd_config.moe_config.num_experts_start_offset - # TODO(gongshaotian): Use worker factory to get worker self.worker = get_worker(fd_config=fd_config, local_rank=self.local_rank, @@ -109,7 +103,7 @@ def __init__( self.task_queue = TaskQueue( address=task_address, is_server=False, - num_client=self.parallel_config.tensor_parallel_degree, + num_client=self.parallel_config.tensor_parallel_size, client_id=self.parallel_config.tensor_parallel_rank, local_data_parallel_id=self.fd_config.parallel_config. 
expert_parallel_rank) @@ -127,8 +121,8 @@ def init_health_status(self) -> None: # init worker_ready_signal array_size = min( - 8, self.parallel_config.tensor_parallel_degree * - self.parallel_config.expert_parallel_degree) + 8, self.parallel_config.tensor_parallel_size * + self.parallel_config.expert_parallel_size) workers_ready = np.zeros(shape=[array_size], dtype=np.int32) self.worker_ready_signal = IPCSignal( name="worker_ready_signal", @@ -160,7 +154,7 @@ def init_health_status(self) -> None: # init exist_task_signal workers_exist_task = np.zeros( - [self.parallel_config.expert_parallel_degree], dtype=np.int32) + [self.parallel_config.expert_parallel_size], dtype=np.int32) self.exist_task_signal = IPCSignal( name="exist_task_signal", array=workers_exist_task, @@ -170,7 +164,7 @@ def init_health_status(self) -> None: # init exist_swapped_task_signal workers_swapped_task = np.zeros( - shape=[self.parallel_config.expert_parallel_degree], + shape=[self.parallel_config.expert_parallel_size], dtype=np.int32) self.exist_swapped_task_signal = IPCSignal( name="exist_swapped_task_signal", @@ -218,8 +212,8 @@ def event_loop_normal(self) -> None: TODO(gongshaotian): support remote calling of functions that control worker. """ # Currently, only support single node - self.nnode = int((self.parallel_config.tensor_parallel_degree + 7) // 8) - mp_num_per_node = self.parallel_config.tensor_parallel_degree // self.nnode + self.nnode = int((self.parallel_config.tensor_parallel_size + 7) // 8) + mp_num_per_node = self.parallel_config.tensor_parallel_size// self.nnode req_ids = [] while True: if self.local_rank == 0: @@ -228,7 +222,7 @@ def event_loop_normal(self) -> None: else: self.exist_task_signal.value[0] = 0 - if self.parallel_config.tensor_parallel_degree > 1: + if self.parallel_config.tensor_parallel_size > 1: # Synchronize before updating weights paddle.distributed.barrier() @@ -246,7 +240,7 @@ def event_loop_normal(self) -> None: self.fd_config.parallel_config. expert_parallel_rank] = 1 - if self.parallel_config.tensor_parallel_degree > 1: + if self.parallel_config.tensor_parallel_size > 1: # Synchronize the signal for other workers # TODO(@wufeisheng): Split TP group and EP group paddle.distributed.barrier() @@ -508,7 +502,7 @@ def parse_args(): parser.add_argument("--enable_expert_parallell", action='store_true', help="enable expert parallell") - parser.add_argument("--ori_vocab_size", type=int, default=None) + parser.add_argument("--vocab_size", type=int, default=None) parser.add_argument("--quantization", type=str, @@ -557,148 +551,29 @@ def parse_args(): def initialize_fd_config(args: argparse.Namespace) -> FDConfig: - """Initialize FDConfig - TODO(gongshaotian): Unified all configs to FDConfig - """ - # NOTE(gongshaotian): From build stream line model - config, _ = ModelConfig.get_config_dict(args.model_name_or_path) - if 'num_experts' in config: - config['moe_num_experts'] = config.pop('num_experts') - - if 'num_experts_per_tok' in config: - config['moe_topk'] = config.pop('num_experts_per_tok') - config["head_dim"] = config.get( - "head_dim", config["hidden_size"] // config["num_attention_heads"]) - config["rope_theta"] = config.get("rope_theta", 10000.0) - model_config = ModelConfig.from_dict(config) - # TODO Set `head_dim` again. Because `ModelConfig` class doesn't support feeding head_dim at all! 
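For illustration only (not part of the patch): the rewritten initialize_fd_config in the '+' lines later in this hunk hands the same vars(args) dict to every sub-config instead of copying fields one by one, then validates the loaded model config. A condensed sketch of that calling pattern, using a hypothetical stand-in class rather than the real FastDeploy config classes:

import argparse

class _StubConfig:
    """Hypothetical stand-in: absorb whatever keys the shared dict provides."""
    def __init__(self, args: dict):
        for key, value in args.items():
            setattr(self, key, value)

def initialize_stub_config(args: argparse.Namespace):
    params = vars(args)                  # one dict drives every sub-config
    model_config = _StubConfig(params)
    parallel_config = _StubConfig(params)
    load_config = _StubConfig(params)
    # Fail fast when the checkpoint config did not provide a layer count.
    if getattr(model_config, "num_hidden_layers", None) is None:
        raise ValueError("num_hidden_layers is None")
    return model_config, parallel_config, load_config
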
- model_config.head_dim = config["head_dim"] paddle.set_default_dtype(args.dtype) + model_config = ModelConfig(vars(args)) + device_config = DeviceConfig(vars(args)) + decoding_config = DecodingConfig(vars(args)) + speculative_config = SpeculativeConfig(vars(args)) + parallel_config = ParallelConfig(vars(args)) + load_config = LoadConfig(vars(args)) - device_config = DeviceConfig() - # model_config = ModelConfig() - - decoding_config = DecodingConfig() - - speculative_config = SpeculativeConfig() - parallel_config = ParallelConfig() - load_config = LoadConfig() - moe_config = MoEConfig() graph_opt_config = GraphOptimizationConfig( - args.enable_static_graph_inference, args.use_cudagraph, - args.max_capture_batch_size) - model_config.quantization = args.quantization - - # Update speculate config - speculative_config.method = args.speculative_method - speculative_config.num_speculative_tokens = args.speculative_max_draft_token_num - speculative_config.model_name_or_path = args.speculative_model_name_or_path - speculative_config.quantization = args.speculative_model_quantization - - # Update parallel config - parallel_config.engine_pid = args.engine_pid - parallel_config.model_name_or_path = args.model_name_or_path - parallel_config.max_num_seqs = args.max_num_seqs - parallel_config.max_block_num = args.total_block_num - parallel_config.block_size = args.block_size - parallel_config.pod_ip = args.pod_ip - parallel_config.engine_worker_queue_port = args.engine_worker_queue_port - parallel_config.max_model_len = args.max_model_len - model_config.max_seq_len = args.max_model_len - model_config.max_length = args.max_model_len - parallel_config.device_ids = args.device_ids - parallel_config.dtype = args.dtype - parallel_config.enc_dec_block_num = args.enc_dec_block_num - parallel_config.kv_cache_ratio = args.kv_cache_ratio - parallel_config.first_token_id = args.first_token_id - parallel_config.gpu_memory_utilization = args.gpu_memory_utilization - parallel_config.engine_pid = args.engine_pid - parallel_config.do_profile = args.do_profile - parallel_config.dynamic_load_weight = args.dynamic_load_weight - parallel_config.pad_token_id = args.pad_token_id - parallel_config.eos_tokens_lens = args.eos_tokens_lens - parallel_config.enable_chunked_prefill = args.enable_chunked_prefill - parallel_config.max_num_batched_tokens = args.max_num_batched_tokens - parallel_config.enable_prefix_caching = args.enable_prefix_caching - - parallel_config.use_ep = args.enable_expert_parallell - parallel_config.tensor_parallel_degree = args.tensor_parallel_size - parallel_config.expert_parallel_degree = args.expert_parallel_size - parallel_config.splitwise_role = args.splitwise_role - load_config.use_fastsafetensor = int(envs.FD_USE_FASTSAFETENSOR) == 1 - - parallel_config.guided_decoding_backend = args.guided_decoding_backend - parallel_config.disable_any_whitespace = args.disable_any_whitespace + args.enable_static_graph_inference, + args.max_capture_batch_size, + vars(args)) logger.info(f"parallel_config.use_ep {parallel_config.use_ep}") logger.info( - f"parallel_config.tensor_parallel_degree {parallel_config.tensor_parallel_degree}" + f"parallel_config.tensor_parallel_size {parallel_config.tensor_parallel_size}" ) logger.info(f"args.splitwise_role {args.splitwise_role}") - if args.splitwise_role == "mixed": - parallel_config.moe_phase = MoEPhase.PREFILL - elif args.splitwise_role == "prefill": - parallel_config.moe_phase = MoEPhase.PREFILL - elif args.splitwise_role == "decode": - parallel_config.moe_phase = 
MoEPhase.DECODER - else: - raise NotImplementedError - - num_key_value_heads = config.get("num_key_value_heads", -1) - if num_key_value_heads is None: - num_key_value_heads = -1 + if getattr(model_config, 'num_hidden_layers', None) is None: + raise ValueError("num_hidden_layers is None") - if config.get("ffn_hidden_size", None) is not None: - ffn_hidden_size = config["ffn_hidden_size"] - elif config.get("intermediate_size", None) is not None: - ffn_hidden_size = config["intermediate_size"] - else: - ffn_hidden_size = 4 * config["hidden_size"] - if config["hidden_act"].lower() == "swiglu": - if paddle.distributed.get_world_size() > 1: - multiple_of = 8 * config["num_attention_heads"] - else: - multiple_of = 4 * config["num_attention_heads"] - ffn_hidden_size = multiple_of * ( - (int(2 * ffn_hidden_size / 3) + multiple_of - 1) // - multiple_of) - - num_layers = config.get("num_layers", None) or config.get( - "num_hidden_layers", None) - if num_layers is None: - raise ValueError(f"num_layers<{num_layers}> is invalid") - - use_moe = config.get("moe_layer_start_index", num_layers) < num_layers - - model_config.ffn_hidden_size = ffn_hidden_size - model_config.num_layers = num_layers - - model_config.num_key_value_heads = num_key_value_heads - model_config.start_layer_index = config.get("start_layer_index", 0) - moe_config.num_experts = config.get("moe_num_experts", None) - moe_config.moe_intermediate_size = config.get("moe_intermediate_size", - None) - moe_config.top_k = config.get("moe_k", config.get("moe_topk", 8)) - moe_config.moe_num_shared_experts = config.get("moe_num_shared_experts", 0) - moe_config.moe_layer_start_index = config.get("moe_layer_start_index", 0) - - moe_config.num_max_dispatch_tokens_per_rank = config.get( - "num_max_dispatch_tokens_per_rank", 256) - moe_config.moe_use_aux_free = config.get("moe_use_aux_free", False) - - model_config.ori_vocab_size = config.get("vocab_size", -1) - if "Ernie4_5_ForCausalLM" in config.get("architectures"): - model_config.ori_vocab_size = args.ori_vocab_size - - if "DeepseekV3ForCausalLM" in config.get("architectures"): - from paddleformers.transformers import AutoConfig - model_config.deepseekv3 = AutoConfig.from_pretrained( - args.model_name_or_path) - - #TODO(@yuanrisheng): kv_cache quant config can only be - # stored in model config file, which should be unified - quantization_config = config.get("quantization_config", None) + quantization_config = model_config.quantization_config if not model_config.is_quantized: if quantization_config is not None: if "kv_cache_quant_type" not in quantization_config: @@ -718,9 +593,9 @@ def initialize_fd_config(args: argparse.Namespace) -> FDConfig: quant_config_name = args.quantization quantization_config["quantization"] = quant_config_name # use some trick code for ernie model and will unify it in future. - is_ernie = "Ernie4_5_ForCausalLM" in config.get("architectures") or \ - "Ernie4_5_MoeForCausalLM" in config.get("architectures") - if use_moe and quant_config_name == "wint4" and is_ernie: + is_ernie = "Ernie4_5_ForCausalLM" in model_config.architectures or \ + "Ernie4_5_MoeForCausalLM" in model_config.architectures + if quant_config_name == "wint4" and is_ernie: quantization_config["dense_quant_type"] = "wint8" quantization_config["moe_quant_type"] = "wint4" quantization_config["quantization"] = "mix_quant" @@ -750,11 +625,6 @@ def initialize_fd_config(args: argparse.Namespace) -> FDConfig: "No quantization config found and use original weight and act dtype." 
) - model_config.architectures = config.get("architectures") - - logger.info("===========load_config==============") - load_config.dynamic_load_weight = args.dynamic_load_weight - load_config.load_strategy = args.load_strategy logger.info(f"- Dynamic load weight: {load_config.dynamic_load_weight}") logger.info(f"- Load strategy: {load_config.load_strategy}") @@ -763,7 +633,6 @@ def initialize_fd_config(args: argparse.Namespace) -> FDConfig: speculative_config=speculative_config, device_config=device_config, load_config=load_config, - moe_config=moe_config, decoding_config=decoding_config, quant_config=quant_config, graph_opt_config=graph_opt_config) diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py index b075356f99..ac2752aa40 100644 --- a/fastdeploy/worker/xpu_model_runner.py +++ b/fastdeploy/worker/xpu_model_runner.py @@ -297,7 +297,7 @@ def process_prefill_inputs(self, req_dicts: List[Request]): "min_tokens", 1) self.share_inputs["max_dec_len"][idx:idx + 1] = request.get( - "max_tokens", self.model_config.max_length) + "max_tokens", self.model_config.max_model_len) self.share_inputs["stop_flags"][idx:idx + 1] = False self.share_inputs["first_token_ids"][ @@ -367,11 +367,11 @@ def _init_share_inputs(self, max_num_seqs: int): self.share_inputs["min_dec_len"] = paddle.full( [max_num_seqs, 1], self.model_config.min_length, dtype='int64') self.share_inputs["max_dec_len"] = paddle.full( - [max_num_seqs, 1], self.model_config.max_length, dtype='int64') + [max_num_seqs, 1], self.model_config.max_model_len, dtype='int64') self.share_inputs["min_length"] = paddle.full( [max_num_seqs, 1], self.model_config.min_length, dtype='int64') self.share_inputs["max_length"] = paddle.full( - [max_num_seqs, 1], self.model_config.max_length, dtype='int64') + [max_num_seqs, 1], self.model_config.max_model_len, dtype='int64') self.share_inputs["seq_lens_this_time"] = paddle.full(max_num_seqs, 0, dtype='int32') @@ -553,7 +553,7 @@ def initialize_kv_cache(self) -> None: kv_cache_shape = self.attn_backends[0].get_kv_cache_shape( max_num_blocks=max_block_num) - for i in range(self.model_config.num_layers): + for i in range(self.model_config.num_hidden_layers): cache_kvs["key_caches_{}".format(i)] = paddle.full( shape=kv_cache_shape, fill_value=0, @@ -576,10 +576,10 @@ def initialize_attn_backend(self) -> None: assert len(self.attn_backends) == 0 # TODO(gongshaotian): Get rank from config - num_heads = self.model_config.num_attention_heads // self.parallel_config.tensor_parallel_degree + num_heads = self.model_config.num_attention_heads // self.parallel_config.tensor_parallel_size self.model_config.kv_num_heads = int( self.model_config.num_key_value_heads - ) // self.parallel_config.tensor_parallel_degree + ) // self.parallel_config.tensor_parallel_size head_dim = self.model_config.head_dim # Get the attention backend @@ -780,7 +780,7 @@ def cal_theortical_kvcache(self): required_memory = ( byte_of_dtype * 2 * # k + v (self.parallel_config.block_size * hidden_dim) * - self.model_config.num_layers) + self.model_config.num_hidden_layers) return required_memory def update_share_input_block_num(self, num_gpu_blocks: int) -> None: From 47b155dd50d7c2d07ddbd78d0a784ab71f748017 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Wed, 9 Jul 2025 12:25:06 +0000 Subject: [PATCH 02/19] fix vl --- fastdeploy/config.py | 8 +++++ fastdeploy/engine/engine.py | 4 +-- .../model_executor/models/deepseek_v3.py | 2 +- .../model_executor/models/ernie4_5_moe.py | 2 +- 
.../model_executor/models/ernie4_5_mtp.py | 2 +- .../models/ernie4_5_vl/ernie4_5_vl_moe.py | 2 +- fastdeploy/model_executor/models/qwen2.py | 2 +- fastdeploy/model_executor/models/qwen3.py | 2 +- fastdeploy/model_executor/models/qwen3moe.py | 2 +- fastdeploy/worker/vl_gpu_model_runner.py | 34 +++++++------------ fastdeploy/worker/worker_process.py | 2 +- 11 files changed, 30 insertions(+), 32 deletions(-) diff --git a/fastdeploy/config.py b/fastdeploy/config.py index 51645ce49a..998fd18937 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -79,6 +79,10 @@ def __init__( "initializer_range":0.02, "max_position_embeddings":512, "quantization_config":None, + "use_recompute_resampler":False, + "use_temporal_conv":True, + "resampler_fuse_rms_norm":False, + "freq_allocation":20, } for key, value in args.items(): @@ -98,6 +102,10 @@ def __init__( if not hasattr(self, "head_dim"): self.head_dim = self.hidden_size // self.num_attention_heads + if hasattr(self, "vision_config"): + self.vision_config = PretrainedConfig.from_dict(self.vision_config) + + self.ori_vocab_size = self.vocab_size if "Ernie4_5_ForCausalLM" in self.architectures: self.vocab_size = args["vocab_size"] diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py index e0e95e4bf9..f46e3010d4 100644 --- a/fastdeploy/engine/engine.py +++ b/fastdeploy/engine/engine.py @@ -997,7 +997,7 @@ def _start_worker_service(self): worker_path = "../worker/vl_worker_process.py" py_script = os.path.join(current_dir_path, worker_path) - vocab_size = ( + ori_vocab_size = ( len(self.data_processor.tokenizer.sp_model) if hasattr(self.data_processor.tokenizer, 'sp_model') else len(self.data_processor.tokenizer.vocab) @@ -1025,7 +1025,7 @@ def _start_worker_service(self): f" --kv_cache_ratio {self.cfg.cache_config.kv_cache_ratio}" f" --expert_parallel_size {self.cfg.parallel_config.expert_parallel_size}" f" --quantization {self.cfg.model_config.quantization}" - f" --vocab_size {vocab_size}" + f" --ori_vocab_size {ori_vocab_size}" f" --speculative_method {self.cfg.speculative_config.method}" f" --speculative_max_draft_token_num {self.cfg.speculative_config.num_speculative_tokens}" f" --speculative_model_name_or_path {self.cfg.speculative_config.model_name_or_path}" diff --git a/fastdeploy/model_executor/models/deepseek_v3.py b/fastdeploy/model_executor/models/deepseek_v3.py index 9844eeb6b1..8437117200 100644 --- a/fastdeploy/model_executor/models/deepseek_v3.py +++ b/fastdeploy/model_executor/models/deepseek_v3.py @@ -620,7 +620,7 @@ def __init__(self, fd_config: FDConfig): """ super().__init__(fd_config) self.model = DeepSeekV3Model(fd_config) - self.ori_vocab_size = fd_config.model_config.vocab_size + self.ori_vocab_size = fd_config.model_config.ori_vocab_size self.lm_head = ParallelLMHead( fd_config, embedding_dim=fd_config.model_config.hidden_size, diff --git a/fastdeploy/model_executor/models/ernie4_5_moe.py b/fastdeploy/model_executor/models/ernie4_5_moe.py index f6cf1cfa1d..417c3b5836 100644 --- a/fastdeploy/model_executor/models/ernie4_5_moe.py +++ b/fastdeploy/model_executor/models/ernie4_5_moe.py @@ -419,7 +419,7 @@ def __init__(self, fd_config: FDConfig): self.fd_config = fd_config self.model = Ernie4_5_Model(fd_config=fd_config) - self.ori_vocab_size = fd_config.model_config.vocab_size + self.ori_vocab_size = fd_config.model_config.ori_vocab_size self.lm_head = ParallelLMHead( fd_config=fd_config, diff --git a/fastdeploy/model_executor/models/ernie4_5_mtp.py b/fastdeploy/model_executor/models/ernie4_5_mtp.py index 
5e032cdd5c..444b135449 100644 --- a/fastdeploy/model_executor/models/ernie4_5_mtp.py +++ b/fastdeploy/model_executor/models/ernie4_5_mtp.py @@ -351,7 +351,7 @@ def __init__(self, fd_config: FDConfig): self.fd_config = fd_config self.model = Ernie4_5_MTPModel(fd_config=fd_config) - self.ori_vocab_size = fd_config.model_config.vocab_size + self.ori_vocab_size = fd_config.model_config.ori_vocab_size self.lm_head = fd_config.speculative_config.sharing_model.lm_head self.tie_word_embeddings = fd_config.model_config.tie_word_embeddings diff --git a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py index 68d059906c..57d995561d 100644 --- a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py +++ b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py @@ -468,7 +468,7 @@ def __init__(self, fd_config: FDConfig): self.model = Ernie4_5_VLModel(fd_config=fd_config) - self.ori_vocab_size = fd_config.model_config.vocab_size + self.ori_vocab_size = fd_config.model_config.ori_vocab_size self.lm_head = ParallelLMHead( fd_config=fd_config, diff --git a/fastdeploy/model_executor/models/qwen2.py b/fastdeploy/model_executor/models/qwen2.py index 760f39e0a3..ee5f1d98c4 100644 --- a/fastdeploy/model_executor/models/qwen2.py +++ b/fastdeploy/model_executor/models/qwen2.py @@ -304,7 +304,7 @@ def __init__(self, fd_config: FDConfig): self.model = Qwen2Model(fd_config=fd_config) - self.ori_vocab_size = fd_config.model_config.vocab_size + self.ori_vocab_size = fd_config.model_config.ori_vocab_size self.lm_head = ParallelLMHead( fd_config=fd_config, diff --git a/fastdeploy/model_executor/models/qwen3.py b/fastdeploy/model_executor/models/qwen3.py index 78eeb46632..81c7ce063d 100644 --- a/fastdeploy/model_executor/models/qwen3.py +++ b/fastdeploy/model_executor/models/qwen3.py @@ -239,7 +239,7 @@ def __init__(self, fd_config: FDConfig): self.model = Qwen3Model(fd_config=fd_config) - self.ori_vocab_size = fd_config.model_config.vocab_size + self.ori_vocab_size = fd_config.model_config.ori_vocab_size self.lm_head = ParallelLMHead( fd_config=fd_config, diff --git a/fastdeploy/model_executor/models/qwen3moe.py b/fastdeploy/model_executor/models/qwen3moe.py index 705ee81d7f..93114e9bf1 100644 --- a/fastdeploy/model_executor/models/qwen3moe.py +++ b/fastdeploy/model_executor/models/qwen3moe.py @@ -358,7 +358,7 @@ def __init__(self, fd_config: FDConfig): self.model = Qwen3MoeModel(fd_config) - self.ori_vocab_size = fd_config.model_config.vocab_size + self.ori_vocab_size = fd_config.model_config.ori_vocab_size self.lm_head = ParallelLMHead( fd_config, diff --git a/fastdeploy/worker/vl_gpu_model_runner.py b/fastdeploy/worker/vl_gpu_model_runner.py index 66be1d24fe..da6f8a3369 100644 --- a/fastdeploy/worker/vl_gpu_model_runner.py +++ b/fastdeploy/worker/vl_gpu_model_runner.py @@ -35,8 +35,6 @@ from fastdeploy.model_executor.layers.sample.sampler import Sampler from fastdeploy.model_executor.models.ernie4_5_moe import \ Ernie4_5_PretrainedModel -from fastdeploy.model_executor.models.ernie4_5_vl.configuration import \ - Ernie4_5_VLMoeConfig from fastdeploy.model_executor.models.ernie4_5_vl.dfnrope import \ DFNRopeVisionTransformerConfig from fastdeploy.model_executor.models.ernie4_5_vl.dfnrope.modeling import \ @@ -194,13 +192,11 @@ def _load_model( if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.unk_token - config = Ernie4_5_VLMoeConfig.from_pretrained( - self.args.llm_model_name_or_path, - 
tensor_parallel_degree=self.tensor_parallel_degree, - tensor_parallel_rank=self.tensor_parallel_rank, - moe_group="dummy", - ) - self.model_cfg = config + self.model_cfg = ModelConfig(vars(self.args)) + self.model_cfg.tensor_parallel_degree=self.tensor_parallel_degree + self.model_cfg.tensor_parallel_rank=self.tensor_parallel_rank + self.model_cfg.moe_group="dummy" + config = self.model_cfg if self.is_safetensors_model: meta_json = os.path.join(self.args.model_name_or_path, "model.safetensors.index.json") @@ -279,7 +275,7 @@ def _load_model( self.model.eval() self.set_state_dict(self.args) - fd_config.parallel_config.max_model_len = fd_config.model_config.max_seq_len + fd_config.parallel_config.max_model_len = fd_config.model_config.max_model_len self.fd_config = fd_config attn_backend_cls = get_attention_backend() num_heads = self.fd_config.model_config.num_attention_heads // \ @@ -338,14 +334,10 @@ def _init_kvcache(self): """ cache_kvs = {} total_block_num = self.num_gpu_blocks - num_layers = self.model_cfg.get("num_layers", - None) or self.model_cfg.get( - "num_hidden_layers", None) + num_layers = self.model_cfg.num_hidden_layers + + kv_num_head = self.model_cfg.num_key_value_heads if self.model_cfg.num_key_value_heads != -1 else self.model_cfg.num_attention_heads - kv_num_head = self.model_cfg.get( - "num_key_value_heads", - self.model_cfg.num_attention_heads, - ) kv_num_head = kv_num_head // self.tensor_parallel_degree self.model_cfg.kv_num_head = kv_num_head @@ -459,7 +451,7 @@ def vit_load( raise ValueError(f"No such a file {rank_model_path}") @paddle.no_grad() - def inject_pp_vision_model(self, args: argparse.Namespace, cfg: Ernie4_5_VLMoeConfig): + def inject_pp_vision_model(self, args: argparse.Namespace, cfg: ModelConfig): """ Inject pp vision model """ @@ -931,9 +923,7 @@ def _cal_theortical_kvcache(self): """ Calculate the size of kvcache for computational theory """ - num_layers = self.model_cfg.get("num_layers", - None) or self.model_cfg.get( - "num_hidden_layers", None) + num_layers = self.model_cfg.num_hidden_layers byte_of_cache = 2 # support c8 c4 @@ -1095,7 +1085,7 @@ def build_stream_line_model( model_config.moe_phase = MoEPhase.PREFILL # use the length of tokenizer as the origin vocab size ori_vocab_size = len(tokenizer) - model_config.vocab_size = ori_vocab_size + model_config.ori_vocab_size = ori_vocab_size quantization_config = model_config.quantization_config diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py index 5151dc6db0..0c4341dde5 100644 --- a/fastdeploy/worker/worker_process.py +++ b/fastdeploy/worker/worker_process.py @@ -502,7 +502,7 @@ def parse_args(): parser.add_argument("--enable_expert_parallell", action='store_true', help="enable expert parallell") - parser.add_argument("--vocab_size", type=int, default=None) + parser.add_argument("--ori_vocab_size", type=int, default=None) parser.add_argument("--quantization", type=str, From a74ff1d58f2b1fffe64f5a6362fe3a5654c8abf4 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Wed, 9 Jul 2025 12:26:16 +0000 Subject: [PATCH 03/19] delete config --- .../models/ernie4_5_vl/configuration.py | 167 ------------------ 1 file changed, 167 deletions(-) delete mode 100644 fastdeploy/model_executor/models/ernie4_5_vl/configuration.py diff --git a/fastdeploy/model_executor/models/ernie4_5_vl/configuration.py b/fastdeploy/model_executor/models/ernie4_5_vl/configuration.py deleted file mode 100644 index c7812e258b..0000000000 --- 
a/fastdeploy/model_executor/models/ernie4_5_vl/configuration.py +++ /dev/null @@ -1,167 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -import copy - -from paddleformers.transformers.configuration_utils import PretrainedConfig - -from .dfnrope.modeling import DFNRopeVisionTransformerConfig - -__all__ = [ - "Ernie4_5_VLMoeConfig", -] - - -class Ernie4_5_VLMoeConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`~ErnieModel`]. It is used to instantiate an Ernie - model according to the specified arguments, defining the model architecture. Instantiating a configuration with the - defaults will yield a similar configuration to that of the Ernie-7B. - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - Args: - vocab_size (`int`, *optional*, defaults to 32000): - Vocabulary size of the Ernie model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`~ErnieModel`] or [`~TFErnieModel`]. - hidden_size (`int`, *optional*, defaults to 4096): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 11008): - Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 32): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 32): - Number of attention heads for each attention layer in the Transformer encoder. - hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) in the decoder. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - rms_norm_eps (`float`, *optional*, defaults to 1e-12): - The epsilon used by the rms normalization layers. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. 
- tie_word_embeddings(`bool`, *optional*, defaults to `False`): - Whether to tie weight embeddings - Example: - ```python - >>> from paddleformers.transformer import ErnieModel, ErnieConfig - - >>> # Initializing a Ernie ernie-7b style configuration - >>> configuration = ErnieConfig() - - >>> # Initializing a model from the ernie-7b style configuration - >>> model = ErnieModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - - model_type = "erniemoevl" - attribute_map = { - "n_positions": "max_position_embeddings", - "n_embd": "hidden_size", - "n_layer": "num_hidden_layers", - "n_head": "num_attention_heads", - "n_inner": "intermediate_size", - "activation_function": "hidden_act", - } - - def __init__( - self, - vision_config=None, - im_patch_id=None, - pixel_hidden_size=None, # None for fuyu - modality_detach=False, - temporal_conv_size=2, - spatial_conv_size=2, - mm_vocab_size=0, # vocab for mm specialtokens - max_text_id=None, - use_temporal_conv=True, - moe_use_size_all2all=False, - moe_num_attn_experts=False, - moe_dense_experts_token_type_id: int = 3, - moe_use_hard_gate: bool = True, - moe_fuse_experts: bool = False, - moe_use_token_type_bias: bool = False, - disable_ffn_model_parallel=False, - fuse_attn_ffn=True, - rope_3d=True, - freq_allocation=20, - using_precision_check=False, - use_recompute_resampler=False, - resampler_fuse_rms_norm=False, - moe_layer_feed_fake_token=False, - moe_num_experts=0, - **kwargs, - ): - super().__init__(**kwargs) - self.vision_config = DFNRopeVisionTransformerConfig( - **vision_config) if vision_config else None - self.im_patch_id = im_patch_id - self.pixel_hidden_size = pixel_hidden_size - self.modality_detach = modality_detach - self.temporal_conv_size = temporal_conv_size - self.spatial_conv_size = spatial_conv_size - self.mm_vocab_size = mm_vocab_size - self.max_text_id = max_text_id - self.use_temporal_conv = use_temporal_conv - - self.moe_use_size_all2all = moe_use_size_all2all - self.moe_num_attn_experts = moe_num_attn_experts - self.moe_dense_experts_token_type_id = moe_dense_experts_token_type_id - self.moe_use_hard_gate = moe_use_hard_gate - self.moe_fuse_experts = moe_fuse_experts - self.moe_use_token_type_bias = moe_use_token_type_bias - self.disable_ffn_model_parallel = disable_ffn_model_parallel - - self.fuse_attn_ffn = fuse_attn_ffn - self.rope_3d = rope_3d - self.freq_allocation = freq_allocation - self.using_precision_check = using_precision_check - self.use_recompute_resampler = use_recompute_resampler - self.resampler_fuse_rms_norm = resampler_fuse_rms_norm - self.moe_layer_feed_fake_token = moe_layer_feed_fake_token - self.moe_num_experts = moe_num_experts - - @property - def multimodel_experts(self) -> bool: - """是否有多种类型的experts.""" - return isinstance(self.moe_num_experts, - (tuple, list)) and len(self.moe_num_experts) > 1 - - @property - def use_moe(self) -> bool: - """ - Check if model is using MoE architecture. 
- - Returns: - bool: True if moe_num_experts > 0, False otherwise - """ - return sum( - self.moe_num_experts - ) > 0 if self.multimodel_experts else self.moe_num_experts > 0 - - def to_dict(self, saving_file=False): - """to_dict""" - output = copy.deepcopy(self.__dict__) - if self.vision_config: - output["vision_config"] = ( - self.vision_config.to_diff_dict() if isinstance( - self.vision_config, - (DFNRopeVisionTransformerConfig)) else self.vision_config) - - output["model_type"] = self.__class__.model_type - return output From cf394db0b1a5e79170373557f55cfcca024f1e7e Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Thu, 10 Jul 2025 03:43:11 +0000 Subject: [PATCH 04/19] fix --- fastdeploy/config.py | 4 ++-- fastdeploy/worker/vl_gpu_model_runner.py | 12 +++++++----- fastdeploy/worker/worker_process.py | 6 +++--- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/fastdeploy/config.py b/fastdeploy/config.py index 1ce3fa6ad5..a33e132b8e 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -187,7 +187,7 @@ def __init__( else: raise NotImplementedError # enable the custom all-reduce kernel and fall back to NCCL(dist.all_reduce). - enable_custom_all_reduce: str = "store_true" + self.enable_custom_all_reduce: bool = False @dataclass class SpeculativeConfig: @@ -225,7 +225,7 @@ def __init__( # During benchmarking, we need to enforce that the number of accepted tokens is 1. # This means no tokens from MTP are accepted. # This ensures that the specified simulation acceptance rate is not affected. - benchmark_mode: bool = False + self.benchmark_mode: bool = False for key, value in args.items(): if hasattr(self, key): diff --git a/fastdeploy/worker/vl_gpu_model_runner.py b/fastdeploy/worker/vl_gpu_model_runner.py index f34bd5d135..d81f0021a1 100644 --- a/fastdeploy/worker/vl_gpu_model_runner.py +++ b/fastdeploy/worker/vl_gpu_model_runner.py @@ -25,9 +25,10 @@ from paddleformers.transformers.model_utils import load_tp_checkpoint from safetensors import safe_open -from fastdeploy.config import (DecodingConfig, DeviceConfig, FDConfig, GraphOptimizationConfig, - LoadConfig, ModelConfig, MoEPhase, - ParallelConfig, SpeculativeConfig) +from fastdeploy.config import (DecodingConfig, DeviceConfig, FDConfig, + GraphOptimizationConfig, LoadConfig, + ModelConfig, MoEPhase, ParallelConfig, + SpeculativeConfig) from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer from fastdeploy.input.mm_processor import DataProcessor from fastdeploy.model_executor.layers.attention import get_attention_backend @@ -266,8 +267,9 @@ def _load_model( self.image_preprocess = image_preprocess graph_opt_config = GraphOptimizationConfig( - self.args.enable_static_graph_inference, self.args.use_cudagraph, - self.args.max_capture_batch_size) + self.args.enable_static_graph_inference, + self.args.max_capture_batch_size, + vars(self.args)) fd_config, self.model = build_stream_line_model( self.args.model_name_or_path, diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py index 7b2101efe2..a904a3a138 100644 --- a/fastdeploy/worker/worker_process.py +++ b/fastdeploy/worker/worker_process.py @@ -615,9 +615,9 @@ def initialize_fd_config(args: argparse.Namespace) -> FDConfig: if quantization_config is not None: quant_config_name = quantization_config["quantization"] - elif getattr(config_or_args, 'quantization', None) != "None": + elif args.quantization != "None": quantization_config = {} - quant_config_name = getattr(config_or_args, 'quantization', None) + quant_config_name = 
args.quantization quantization_config["quantization"] = quant_config_name # Special handling for Ernie models is_ernie = "Ernie4_5_ForCausalLM" in model_config.architectures or \ @@ -647,7 +647,7 @@ def initialize_fd_config(args: argparse.Namespace) -> FDConfig: logger.info( "Model Status: Original (will apply online quantization)") - logger.info(f"Quantization Method: {getattr(config_or_args, 'quantization', 'None')}") + logger.info(f"{quantization_config}") else: logger.info( "No quantization config found and use original weight and act dtype." From c43365baca7b07fb5be5cbfde4092bac0ae88ff0 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Thu, 10 Jul 2025 03:48:10 +0000 Subject: [PATCH 05/19] perfect code --- fastdeploy/config.py | 1 - fastdeploy/worker/worker_process.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/fastdeploy/config.py b/fastdeploy/config.py index a33e132b8e..8f1672872c 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -115,7 +115,6 @@ def __init__( self, args, ): - self.block_size = 16 # The block size for processing. self.sequence_parallel = False # Whether to enable sequence parallelism. self.use_ep = False # Whether to enable Expert Parallelism self.moe_phase = MoEPhase.PREFILL # Generation phase diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py index a904a3a138..95a3dcbae5 100644 --- a/fastdeploy/worker/worker_process.py +++ b/fastdeploy/worker/worker_process.py @@ -225,7 +225,7 @@ def event_loop_normal(self) -> None: """ # Currently, only support single node self.nnode = int((self.parallel_config.tensor_parallel_size + 7) // 8) - mp_num_per_node = self.parallel_config.tensor_parallel_size// self.nnode + mp_num_per_node = self.parallel_config.tensor_parallel_size // self.nnode req_ids = [] while True: if self.local_rank == 0: From b680899357d258b8159de251df5ebfccff98d534 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Thu, 10 Jul 2025 06:43:02 +0000 Subject: [PATCH 06/19] fix ci --- fastdeploy/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fastdeploy/config.py b/fastdeploy/config.py index 8f1672872c..210d548b59 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -106,8 +106,8 @@ def __init__( self.vision_config = PretrainedConfig.from_dict(self.vision_config) self.ori_vocab_size = self.vocab_size - if "Ernie4_5_ForCausalLM" in self.architectures: - self.vocab_size = args["vocab_size"] + if "Ernie4_5_ForCausalLM" in self.architectures or "Ernie4_5_MoeForCausalLM" in self.architectures: + self.ori_vocab_size = args["ori_vocab_size"] class ParallelConfig: """Configuration for the distributed execution.""" From be39406ae7aaebcad2d8a6d4b9dfca6d568702c9 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Thu, 10 Jul 2025 07:22:32 +0000 Subject: [PATCH 07/19] fix xpu --- fastdeploy/config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fastdeploy/config.py b/fastdeploy/config.py index 210d548b59..85d5634f65 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -83,6 +83,7 @@ def __init__( "use_temporal_conv":True, "resampler_fuse_rms_norm":False, "freq_allocation":20, + "tie_word_embeddings":False, } for key, value in args.items(): From 137eaf14bfaa02eb5713132ccafcc1ff6aa4d88f Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Thu, 10 Jul 2025 08:03:56 +0000 Subject: [PATCH 08/19] fix xpu --- fastdeploy/engine/engine.py | 4 ++-- .../layers/attention/iluvatar_attn_backend.py | 2 +- .../layers/backends/gcu/attention/flash_attn_backend.py | 2 +- 
.../backends/gcu/attention/mem_efficient_attn_backend.py | 2 +- fastdeploy/model_executor/load_weight_utils.py | 2 +- fastdeploy/model_executor/models/deepseek_v3.py | 2 +- fastdeploy/model_executor/models/ernie4_5_moe.py | 2 +- fastdeploy/model_executor/models/ernie4_5_mtp.py | 2 +- fastdeploy/model_executor/models/qwen2.py | 2 +- fastdeploy/model_executor/models/qwen3.py | 2 +- fastdeploy/model_executor/models/qwen3moe.py | 2 +- fastdeploy/rl/rollout_model.py | 2 +- fastdeploy/worker/gcu_model_runner.py | 6 +++--- fastdeploy/worker/iluvatar_model_runner.py | 6 +++--- 14 files changed, 19 insertions(+), 19 deletions(-) diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py index 627e71e2cc..b36673b4de 100644 --- a/fastdeploy/engine/engine.py +++ b/fastdeploy/engine/engine.py @@ -1194,10 +1194,10 @@ def detect_thread(): r'set state for layer (\d+)', line)): progress = eval(match.group( - 1)) * 1.0 / self.cfg.model_config.num_layers + 1)) * 1.0 / self.cfg.model_config.num_hidden_layers self.worker_init_status["layer_loadding"] = progress if self.worker_init_status[ - "layer_loadding"] == self.cfg.model_config.num_layers - 1: + "layer_loadding"] == self.cfg.model_config.num_hidden_layers - 1: self.worker_init_status["finished"] = True self.checking_worker_status_thread = threading.Thread( diff --git a/fastdeploy/model_executor/layers/attention/iluvatar_attn_backend.py b/fastdeploy/model_executor/layers/attention/iluvatar_attn_backend.py index 43e0341940..c93937950f 100644 --- a/fastdeploy/model_executor/layers/attention/iluvatar_attn_backend.py +++ b/fastdeploy/model_executor/layers/attention/iluvatar_attn_backend.py @@ -101,7 +101,7 @@ def __init__(self, llm_config: FDConfig, kv_num_heads: int, num_heads: int, self.head_dim = head_dim # note: scale need to change if using MLA self.attention_metadata.scale = 1.0 / sqrt(head_dim) - self.num_layers = llm_config.model_config.num_layers + self.num_layers = llm_config.model_config.num_hidden_layers self.record_block_table_metadata = {} self.only_use_flash_attn = int( os.getenv("FD_ILUVATAR_ONLY_USE_FLASH_ATTN", 0)) == 1 diff --git a/fastdeploy/model_executor/layers/backends/gcu/attention/flash_attn_backend.py b/fastdeploy/model_executor/layers/backends/gcu/attention/flash_attn_backend.py index 56870de82e..0c51f6c052 100644 --- a/fastdeploy/model_executor/layers/backends/gcu/attention/flash_attn_backend.py +++ b/fastdeploy/model_executor/layers/backends/gcu/attention/flash_attn_backend.py @@ -91,7 +91,7 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int, self.num_heads = num_heads self.head_dim = head_dim self.scaling = 1.0 / (self.head_dim**0.5) - self.num_layers = fd_config.model_config.num_layers + self.num_layers = fd_config.model_config.num_hidden_layers self.position_ids_base = paddle.arange(self.max_seq_len) # TODO(zhengjun): Need to adapt the allocation logic and diff --git a/fastdeploy/model_executor/layers/backends/gcu/attention/mem_efficient_attn_backend.py b/fastdeploy/model_executor/layers/backends/gcu/attention/mem_efficient_attn_backend.py index bc5d8f1513..d7fb550106 100644 --- a/fastdeploy/model_executor/layers/backends/gcu/attention/mem_efficient_attn_backend.py +++ b/fastdeploy/model_executor/layers/backends/gcu/attention/mem_efficient_attn_backend.py @@ -90,7 +90,7 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int, self.num_heads = num_heads self.head_dim = head_dim self.scaling = 1.0 / (self.head_dim**0.5) - self.num_layers = fd_config.model_config.num_layers 
+ self.num_layers = fd_config.model_config.num_hidden_layers self.position_ids_base = paddle.arange(self.max_seq_len) # TODO(zhengjun): Need to adapt the allocation logic and diff --git a/fastdeploy/model_executor/load_weight_utils.py b/fastdeploy/model_executor/load_weight_utils.py index 36b09cf30b..9dbae9f27f 100644 --- a/fastdeploy/model_executor/load_weight_utils.py +++ b/fastdeploy/model_executor/load_weight_utils.py @@ -43,7 +43,7 @@ def load_ep_checkpoint(model_path: str, filtered_map = {k: v for k, v in weight_list.items() if "experts" not in k} num_local_ffn_keys = [] - for i in range(config.moe_layer_start_index, config.num_layers): + for i in range(config.moe_layer_start_index, config.num_hidden_layers): for j in range( config.num_experts_start_offset, config.num_experts_start_offset + config.num_experts_per_rank, diff --git a/fastdeploy/model_executor/models/deepseek_v3.py b/fastdeploy/model_executor/models/deepseek_v3.py index 8437117200..963daef020 100644 --- a/fastdeploy/model_executor/models/deepseek_v3.py +++ b/fastdeploy/model_executor/models/deepseek_v3.py @@ -758,5 +758,5 @@ def get_tensor_parallel_split_mappings(num_layers): return final_actions - mappings = get_tensor_parallel_split_mappings(config.num_layers) + mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers) return mappings diff --git a/fastdeploy/model_executor/models/ernie4_5_moe.py b/fastdeploy/model_executor/models/ernie4_5_moe.py index 417c3b5836..9384d90d71 100644 --- a/fastdeploy/model_executor/models/ernie4_5_moe.py +++ b/fastdeploy/model_executor/models/ernie4_5_moe.py @@ -618,7 +618,7 @@ def get_tensor_parallel_split_mappings(num_layers, moe_num_experts, elif isinstance(config.moe_layer_start_index, int): moe_layer_start_index = config.moe_layer_start_index - mappings = get_tensor_parallel_split_mappings(config.num_layers, + mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers, moe_num_experts, moe_layer_start_index, config.prefix_name) diff --git a/fastdeploy/model_executor/models/ernie4_5_mtp.py b/fastdeploy/model_executor/models/ernie4_5_mtp.py index 444b135449..35e64ddd62 100644 --- a/fastdeploy/model_executor/models/ernie4_5_mtp.py +++ b/fastdeploy/model_executor/models/ernie4_5_mtp.py @@ -237,7 +237,7 @@ def get_tensor_parallel_split_mappings(num_layers, moe_num_experts, moe_num_experts = 0 mappings = get_tensor_parallel_split_mappings( - config.num_layers, + config.num_hidden_layers, moe_num_experts, config.moe_layer_start_index, ) diff --git a/fastdeploy/model_executor/models/qwen2.py b/fastdeploy/model_executor/models/qwen2.py index 114be5141a..9e99ed62a6 100644 --- a/fastdeploy/model_executor/models/qwen2.py +++ b/fastdeploy/model_executor/models/qwen2.py @@ -427,6 +427,6 @@ def get_tensor_parallel_split_mappings(num_layers): return final_actions - mappings = get_tensor_parallel_split_mappings(config.num_layers) + mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers) return mappings diff --git a/fastdeploy/model_executor/models/qwen3.py b/fastdeploy/model_executor/models/qwen3.py index 0bfea17448..a9153a870f 100644 --- a/fastdeploy/model_executor/models/qwen3.py +++ b/fastdeploy/model_executor/models/qwen3.py @@ -357,5 +357,5 @@ def get_tensor_parallel_split_mappings(num_layers): return final_actions - mappings = get_tensor_parallel_split_mappings(config.num_layers) + mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers) return mappings diff --git a/fastdeploy/model_executor/models/qwen3moe.py 
b/fastdeploy/model_executor/models/qwen3moe.py index 93114e9bf1..5960a750de 100644 --- a/fastdeploy/model_executor/models/qwen3moe.py +++ b/fastdeploy/model_executor/models/qwen3moe.py @@ -502,7 +502,7 @@ def get_tensor_parallel_split_mappings(num_layers, moe_num_experts): f"Not support type of moe_num_experts [{type(config.moe_num_experts)}]" ) - mappings = get_tensor_parallel_split_mappings(config.num_layers, + mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers, moe_num_experts) return mappings diff --git a/fastdeploy/rl/rollout_model.py b/fastdeploy/rl/rollout_model.py index b31105bab8..eeab79d675 100644 --- a/fastdeploy/rl/rollout_model.py +++ b/fastdeploy/rl/rollout_model.py @@ -278,7 +278,7 @@ def _add_layer_mappings(layer_idx): f"{train_base_name}.{layer_idx}.mlp.down_proj.{ph}" for layer_idx in range( - self.fd_config.model_config.num_layers): + self.fd_config.model_config.num_hidden_layers): _add_layer_mappings(layer_idx) return infer_to_train diff --git a/fastdeploy/worker/gcu_model_runner.py b/fastdeploy/worker/gcu_model_runner.py index 811b2b6916..c54fad95b3 100644 --- a/fastdeploy/worker/gcu_model_runner.py +++ b/fastdeploy/worker/gcu_model_runner.py @@ -673,7 +673,7 @@ def initialize_kv_cache(self) -> None: "prefix_caching is not support by GCUModelRunner." ) else: - for i in range(self.model_config.num_layers): + for i in range(self.model_config.num_hidden_layers): cache_kvs["key_caches_{}".format(i)] = paddle.full( shape=kv_cache_shape, @@ -1186,11 +1186,11 @@ def cal_theortical_kvcache(self): byte_of_dtype = 2 hidden_dim = self.model_config.head_dim * self.model_config.kv_num_heads - num_layers = self.model_config.num_layers + \ + num_layers = self.model_config.num_hidden_layers + \ self.speculative_config.num_gpu_block_expand_ratio if \ self.speculative_method in [ "mtp" - ] else self.model_config.num_layers + ] else self.model_config.num_hidden_layers required_memory = ( byte_of_dtype * 2 * # k + v (self.parallel_config.block_size * hidden_dim) * num_layers) diff --git a/fastdeploy/worker/iluvatar_model_runner.py b/fastdeploy/worker/iluvatar_model_runner.py index 42aadd9b6c..7e76ce57ac 100644 --- a/fastdeploy/worker/iluvatar_model_runner.py +++ b/fastdeploy/worker/iluvatar_model_runner.py @@ -645,7 +645,7 @@ def initialize_kv_cache(self) -> None: or self.parallel_config.splitwise_role != "mixed"): raise NotImplementedError("Iluvatar does not support yet") else: - for i in range(self.model_config.num_layers): + for i in range(self.model_config.num_hidden_layers): cache_kvs["key_caches_{}".format(i)] = paddle.full( shape=kv_cache_shape, @@ -1142,11 +1142,11 @@ def cal_theortical_kvcache(self): hidden_dim = self.model_config.head_dim * self.model_config.kv_num_heads # NOTE(liuzichang): Implement multi-layer MTP architecture in the future - num_layers = self.model_config.num_layers + \ + num_layers = self.model_config.num_hidden_layers + \ self.speculative_config.num_gpu_block_expand_ratio if \ self.speculative_method in [ "mtp" - ] else self.model_config.num_layers + ] else self.model_config.num_hidden_layers required_memory = ( byte_of_dtype * 2 * # k + v (self.parallel_config.block_size * hidden_dim) * num_layers) From 6b7c5016184c9fb1e19d6e2de65d4da2b747b6f8 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Thu, 10 Jul 2025 12:55:07 +0000 Subject: [PATCH 09/19] fix server --- fastdeploy/engine/engine.py | 4 ++-- fastdeploy/model_executor/models/qwen3moe.py | 24 ++++++++++---------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git 
a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py index 8e4c4c6024..13ef54f80f 100644 --- a/fastdeploy/engine/engine.py +++ b/fastdeploy/engine/engine.py @@ -1195,10 +1195,10 @@ def detect_thread(): r'set state for layer (\d+)', line)): progress = eval(match.group( - 1)) * 1.0 / self.cfg.model_config.num_hidden_layers + 1)) * 1.0 / self.cfg.model_config.num_layers self.worker_init_status["layer_loadding"] = progress if self.worker_init_status[ - "layer_loadding"] == self.cfg.model_config.num_hidden_layers - 1: + "layer_loadding"] == self.cfg.model_config.num_layers - 1: self.worker_init_status["finished"] = True self.checking_worker_status_thread = threading.Thread( diff --git a/fastdeploy/model_executor/models/qwen3moe.py b/fastdeploy/model_executor/models/qwen3moe.py index 5960a750de..a34a772b59 100644 --- a/fastdeploy/model_executor/models/qwen3moe.py +++ b/fastdeploy/model_executor/models/qwen3moe.py @@ -199,14 +199,14 @@ def __init__( f"{prefix}.mlp.experts.{{}}.down_proj.weight", } - if (fd_config.model_config.moe_num_experts is not None + if (fd_config.model_config.num_experts is not None and layer_id >= fd_config.model_config.moe_layer_start_index): self.mlp = FusedMoE(fd_config, moe_intermediate_size=fd_config.model_config. moe_intermediate_size, - num_experts=fd_config.model_config.moe_num_experts, - top_k=fd_config.model_config.moe_topk, + num_experts=fd_config.model_config.num_experts, + top_k=fd_config.model_config.num_experts_per_tok, layer_idx=layer_id, weight_key_map=weight_key_map) else: @@ -435,7 +435,7 @@ def _get_tensor_parallel_mappings(cls, config: ModelConfig, is_split=True): num_attention_heads=config.num_attention_heads, ) - def get_tensor_parallel_split_mappings(num_layers, moe_num_experts): + def get_tensor_parallel_split_mappings(num_layers, num_experts): final_actions = {} base_actions = { @@ -486,23 +486,23 @@ def get_tensor_parallel_split_mappings(num_layers, moe_num_experts): for key, action in base_actions.items(): for i in range(num_layers): newkey = key.replace("layers.0.", f"layers.{i}.") - for j in range(moe_num_experts): + for j in range(num_experts): newkey2 = newkey.replace("experts.0.", f"experts.{j}.") final_actions[newkey2] = action return final_actions - moe_num_experts = 0 - if isinstance(config.moe_num_experts, list): - moe_num_experts = sum(config.moe_num_experts) - elif isinstance(config.moe_num_experts, int): - moe_num_experts = config.moe_num_experts + num_experts = 0 + if isinstance(config.num_experts, list): + num_experts = sum(config.num_experts) + elif isinstance(config.num_experts, int): + num_experts = config.num_experts else: raise ValueError( - f"Not support type of moe_num_experts [{type(config.moe_num_experts)}]" + f"Not support type of num_experts [{type(config.num_experts)}]" ) mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers, - moe_num_experts) + num_experts) return mappings From 47979f521f972e4a01e93f2c53a7ccecce1f44f7 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Thu, 10 Jul 2025 13:08:39 +0000 Subject: [PATCH 10/19] resolve conflict --- fastdeploy/config.py | 31 ------------------------------- 1 file changed, 31 deletions(-) diff --git a/fastdeploy/config.py b/fastdeploy/config.py index 495266ae31..d1e26165ef 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -104,37 +104,6 @@ def __init__( if not hasattr(self, "head_dim"): self.head_dim = self.hidden_size // self.num_attention_heads -<<<<<<< HEAD -======= - else: - self.head_dim = head_dim - self.hidden_act = hidden_act - 
self.hidden_dropout_prob = hidden_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.use_rope = use_rope - self.rope_theta = rope_theta - self.ori_vocab_size = ori_vocab_size or vocab_size - self.max_seq_len = max_seq_len - self.prefix_name = prefix_name - self.freeze_embedding = freeze_embedding - self.rope_head_dim = rope_head_dim - moe_num_experts = kwargs.get("moe_num_experts", 0) - if moe_layer_start_index is not None: - self.moe_layer_start_index = moe_layer_start_index - elif moe_num_experts == 0: - self.moe_layer_start_index = self.num_layers - self.moe_num_experts = 0 - if moe_layer_end_index is not None: - self.moe_layer_end_index = moe_layer_end_index - self.ffn_hidden_size = ffn_hidden_size - self.rope_3d = rope_3d - self.start_layer_index = start_layer_index - self.dtype = dtype - self.tie_word_embeddings = tie_word_embeddings - self.is_quantized = is_quantized - self.rms_norm_eps = rms_norm_eps ->>>>>>> 59071268b67c2596c58779899165f0286139968d From 1f9d190f0ac6578c69c84c3b6f4be458ba4c17dc Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Fri, 11 Jul 2025 07:40:21 +0000 Subject: [PATCH 11/19] fix mtp --- fastdeploy/config.py | 21 ++++++++++++------- .../model_executor/load_weight_utils.py | 2 +- fastdeploy/worker/worker_process.py | 9 ++++++-- 3 files changed, 21 insertions(+), 11 deletions(-) diff --git a/fastdeploy/config.py b/fastdeploy/config.py index d1e26165ef..23b42a5ca0 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -92,7 +92,8 @@ def __init__( if hasattr(self, key): setattr(self, key, value) pretrained_config, _ = PretrainedConfig.get_config_dict(self.model_name_or_path) - + self.pretrained_config = pretrained_config + # set attribute from pretrained_config for key, value in pretrained_config.items(): setattr(self, key, value) @@ -164,7 +165,7 @@ def __init__( # self.eos_tokens_lens: int = 2 # Enable chunked prefill - self.enable_chunked_prefill: str = "store_true" + self.enable_chunked_prefill: bool = False self.max_num_batched_tokens: int = 2048 # enable prefix cache @@ -191,7 +192,6 @@ def __init__( # enable the custom all-reduce kernel and fall back to NCCL(dist.all_reduce). self.enable_custom_all_reduce: bool = False -@dataclass class SpeculativeConfig: """ Configuration for speculative decoding. @@ -229,9 +229,17 @@ def __init__( # This ensures that the specified simulation acceptance rate is not affected. self.benchmark_mode: bool = False + #TODO(YuanRisheng): The names of the server args are different from the names in SpeculativeConfig. #We temporarily add the name map here and will delete it in the future.
+ name_map = {"speculative_method": "method", + "speculative_max_draft_token_num": "num_speculative_tokens", + "speculative_model_name_or_path": "model_name_or_path", + "speculative_model_quantization": "quantization", + "speculative_benchmark_mode": "benchmark_mode"} + for key, value in args.items(): - if hasattr(self, key): - setattr(self, key, value) + if key in name_map.keys() and hasattr(self, name_map[key]): + setattr(self, name_map[key], value) class DeviceConfig: """ @@ -365,19 +373,16 @@ def __init__( if hasattr(self, key): setattr(self, key, value) -@dataclass class LoRAConfig: """ LoRA Config """ pass -@dataclass class KVCacheConfig: """ KV Cache Config """ cache_quant_dtype: str = "none" -@dataclass class DecodingConfig: """ Configuration for decoding diff --git a/fastdeploy/model_executor/load_weight_utils.py b/fastdeploy/model_executor/load_weight_utils.py index 9dbae9f27f..b0ad8648ba 100644 --- a/fastdeploy/model_executor/load_weight_utils.py +++ b/fastdeploy/model_executor/load_weight_utils.py @@ -282,7 +282,7 @@ def load_composite_checkpoint( else: state_dict = load_tp_checkpoint(model_path, cls, - fd_config.model_config, + fd_config.model_config.pretrained_config, return_numpy=return_numpy) if not state_dict: raise ValueError("weight not found in state_dict !") diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py index 0eab9943ad..fe636ef9ef 100644 --- a/fastdeploy/worker/worker_process.py +++ b/fastdeploy/worker/worker_process.py @@ -495,8 +495,8 @@ def parse_args(): ) parser.add_argument( "--speculative_benchmark_mode", - default="false", - type=str, + default=False, + type=bool, ) parser.add_argument("--max_num_batched_tokens", type=int, @@ -597,6 +597,11 @@ def initialize_fd_config(args: argparse.Namespace) -> FDConfig: args.max_capture_batch_size, vars(args)) + # Note(tangbinhan): used for load_checkpoint + model_config.pretrained_config.tensor_parallel_rank = parallel_config.tensor_parallel_rank + model_config.pretrained_config.tensor_parallel_degree = parallel_config.tensor_parallel_size + model_config.pretrained_config.is_mtp = speculative_config.is_mtp + logger.info(f"parallel_config.use_ep {parallel_config.use_ep}") logger.info( f"parallel_config.tensor_parallel_size {parallel_config.tensor_parallel_size}" From b2370c8e256a694df36e31454a66a38acc6ea008 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Fri, 11 Jul 2025 07:53:57 +0000 Subject: [PATCH 12/19] resolve conflict --- fastdeploy/config.py | 4 ++-- fastdeploy/worker/worker_process.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fastdeploy/config.py b/fastdeploy/config.py index 23b42a5ca0..d884002ceb 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -92,8 +92,8 @@ def __init__( if hasattr(self, key): setattr(self, key, value) pretrained_config, _ = PretrainedConfig.get_config_dict(self.model_name_or_path) - self.pretrained_config = pretrained_config - + self.pretrained_config = PretrainedConfig.from_dict(pretrained_config) + # set attribute from pretrained_config for key, value in pretrained_config.items(): setattr(self, key, value) diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py index fe636ef9ef..69668a0bb2 100644 --- a/fastdeploy/worker/worker_process.py +++ b/fastdeploy/worker/worker_process.py @@ -600,7 +600,7 @@ def initialize_fd_config(args: argparse.Namespace) -> FDConfig: # Note(tangbinhan): used for load_checkpoint model_config.pretrained_config.tensor_parallel_rank = 
parallel_config.tensor_parallel_rank model_config.pretrained_config.tensor_parallel_degree = parallel_config.tensor_parallel_size - model_config.pretrained_config.is_mtp = speculative_config.is_mtp + model_config.pretrained_config.is_mtp = False logger.info(f"parallel_config.use_ep {parallel_config.use_ep}") logger.info( From cf00d39ac5c1e010cddff55c596d5a7b3e2f5e3d Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Fri, 11 Jul 2025 08:15:39 +0000 Subject: [PATCH 13/19] fix xpu --- fastdeploy/worker/worker_process.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py index 69668a0bb2..d457f68cd4 100644 --- a/fastdeploy/worker/worker_process.py +++ b/fastdeploy/worker/worker_process.py @@ -601,6 +601,7 @@ def initialize_fd_config(args: argparse.Namespace) -> FDConfig: model_config.pretrained_config.tensor_parallel_rank = parallel_config.tensor_parallel_rank model_config.pretrained_config.tensor_parallel_degree = parallel_config.tensor_parallel_size model_config.pretrained_config.is_mtp = False + model_config.pretrained_config.head_dim = model_config.head_dim logger.info(f"parallel_config.use_ep {parallel_config.use_ep}") logger.info( From a845f29f26eb6d50b67121505a26abb01845d0c4 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Fri, 11 Jul 2025 10:01:15 +0000 Subject: [PATCH 14/19] fix xpu --- fastdeploy/config.py | 57 ++++++++++--------- .../model_executor/layers/embeddings.py | 6 ++ .../model_executor/models/deepseek_v3.py | 6 +- .../model_executor/models/ernie4_5_moe.py | 14 ++--- .../model_executor/models/ernie4_5_mtp.py | 8 +-- .../models/ernie4_5_vl/ernie4_5_vl_moe.py | 8 +-- fastdeploy/model_executor/models/qwen2.py | 10 ++-- fastdeploy/model_executor/models/qwen3.py | 14 ++--- fastdeploy/model_executor/models/qwen3moe.py | 17 +++--- fastdeploy/model_executor/models/tp_utils.py | 2 +- fastdeploy/spec_decode/mtp.py | 4 +- fastdeploy/worker/gcu_model_runner.py | 16 +++--- fastdeploy/worker/iluvatar_model_runner.py | 14 ++--- fastdeploy/worker/worker_process.py | 3 + 14 files changed, 96 insertions(+), 83 deletions(-) diff --git a/fastdeploy/config.py b/fastdeploy/config.py index d884002ceb..55274110db 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -21,6 +21,7 @@ from typing import Literal, Optional from paddleformers.transformers.configuration_utils import PretrainedConfig +from paddleformers.trl import llm_utils from fastdeploy import envs from fastdeploy.model_executor.layers.quantization.quant_base import \ @@ -37,6 +38,29 @@ class MoEPhase(Enum): PREFILL = 1 DECODER = 2 +PRETRAINED_INIT_CONFIGURATION = { + "rope_theta": 10000.0, + "num_key_value_heads":-1, + "start_layer_index": 0, + "moe_num_shared_experts":0, + "moe_layer_start_index": 0, + "num_max_dispatch_tokens_per_rank":256, + "moe_use_aux_free":False, + "vocab_size": -1, + "use_rope": True, + "hidden_dropout_prob":0.0, + "initializer_range":0.02, + "max_position_embeddings":512, + "quantization_config":None, + "use_recompute_resampler":False, + "use_temporal_conv":True, + "resampler_fuse_rms_norm":False, + "freq_allocation":20, + "tie_word_embeddings":False, + "rms_norm_eps":1e-5, +} + + class ModelConfig: """ The configuration class to store the configuration of a `LLM`. 
@@ -66,31 +90,10 @@ def __init__( self.dtype = "" self.enable_logprob = False - PRETRAINED_INIT_CONFIGURATION = { - "rope_theta": 10000.0, - "num_key_value_heads":-1, - "start_layer_index": 0, - "moe_num_shared_experts":0, - "moe_layer_start_index": 0, - "num_max_dispatch_tokens_per_rank":256, - "moe_use_aux_free":False, - "vocab_size": -1, - "use_rope": True, - "hidden_dropout_prob":0.0, - "initializer_range":0.02, - "max_position_embeddings":512, - "quantization_config":None, - "use_recompute_resampler":False, - "use_temporal_conv":True, - "resampler_fuse_rms_norm":False, - "freq_allocation":20, - "tie_word_embeddings":False, - "rms_norm_eps":1e-5, - } - for key, value in args.items(): if hasattr(self, key): setattr(self, key, value) + pretrained_config, _ = PretrainedConfig.get_config_dict(self.model_name_or_path) self.pretrained_config = PretrainedConfig.from_dict(pretrained_config) @@ -123,10 +126,12 @@ def __init__( self.use_ep = False # Whether to enable Expert Parallelism self.moe_phase = MoEPhase.PREFILL # Generation phase self.msg_queue_id = 1 # mesage queue id - self.tensor_parallel_rank = None # TP rank ID - self.tensor_parallel_size = None # TP degree - self.expert_parallel_rank = None # EP rank ID - self.expert_parallel_size= None # EP degree + + tensor_parallel_rank, tensor_parallel_size = llm_utils.init_dist_env() + self.tensor_parallel_rank = tensor_parallel_rank # TP rank ID + self.tensor_parallel_size = tensor_parallel_size # TP degree + self.expert_parallel_rank = int(tensor_parallel_rank / tensor_parallel_size) # EP rank ID + self.expert_parallel_size = 1 # EP degree # The embedding weight distributed on your gpu cards is divided by row or column. # Defaults to False means divide by row. When vocab_size can not be divided by world_size # but hidden_size can, we can consider split embedding weight by column. diff --git a/fastdeploy/model_executor/layers/embeddings.py b/fastdeploy/model_executor/layers/embeddings.py index 44b270cf61..487aceb187 100644 --- a/fastdeploy/model_executor/layers/embeddings.py +++ b/fastdeploy/model_executor/layers/embeddings.py @@ -112,6 +112,12 @@ def load_state_dict(self, state_dict: Dict[str, Args: state_dict (dict): A dictionary containing the checkpoint weights and biases. 
""" + from fastdeploy.utils import get_logger + + logger = get_logger("other", "other.log") + logger.info(f"prefix: {self.prefix}") + a = state_dict[self.prefix + ".weight"] + logger.info(f"tensor : {a}") if self.tie_word_embeddings: self.word_embeddings.weight.set_value( get_tensor(state_dict[self.prefix + ".weight"]).astype( diff --git a/fastdeploy/model_executor/models/deepseek_v3.py b/fastdeploy/model_executor/models/deepseek_v3.py index 7161155ab1..a9ac1d50b4 100644 --- a/fastdeploy/model_executor/models/deepseek_v3.py +++ b/fastdeploy/model_executor/models/deepseek_v3.py @@ -27,6 +27,7 @@ from fastdeploy.config import FDConfig from fastdeploy.distributed.communication_op import \ tensor_model_parallel_all_reduce +from fastdeploy.model_executor.forward_meta import ForwardMeta from fastdeploy.model_executor.layers.activation import SiluAndMul from fastdeploy.model_executor.layers.attention.attention import Attention from fastdeploy.model_executor.layers.embeddings import VocabParallelEmbedding @@ -40,7 +41,6 @@ DeepseekScalingRotaryEmbedding from fastdeploy.model_executor.models.model_base import ModelForCasualLM from fastdeploy.platforms import current_platform -from fastdeploy.model_executor.forward_meta import ForwardMeta if current_platform.is_cuda(): from fastdeploy.model_executor.ops.gpu import \ @@ -526,7 +526,7 @@ def __init__( """ super().__init__() self.num_layers = fd_config.model_config.num_hidden_layers - fd_config.model_config.prefix_name = "deepseek_v3" + fd_config.model_config.pretrained_config.prefix_name = "deepseek_v3" self.embeddings = VocabParallelEmbedding( fd_config, @@ -539,7 +539,7 @@ def __init__( self.decoder_layers = nn.LayerList([ DeepSeekV3DecoderLayer( fd_config, - prefix=f"{fd_config.model_config.prefix_name}.layers.{i}") + prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.layers.{i}") for i in range(self.num_layers) ]) diff --git a/fastdeploy/model_executor/models/ernie4_5_moe.py b/fastdeploy/model_executor/models/ernie4_5_moe.py index b7ed856e6c..1bc6209d05 100644 --- a/fastdeploy/model_executor/models/ernie4_5_moe.py +++ b/fastdeploy/model_executor/models/ernie4_5_moe.py @@ -25,7 +25,8 @@ from paddleformers.transformers import PretrainedModel from paddleformers.utils.log import logger -from fastdeploy.config import FDConfig, ModelConfig +from fastdeploy.config import FDConfig +from fastdeploy.model_executor.forward_meta import ForwardMeta from fastdeploy.model_executor.graph_optimization.decorator import \ support_graph_optimization from fastdeploy.model_executor.layers.activation import SiluAndMul @@ -41,7 +42,6 @@ from fastdeploy.model_executor.models.utils import \ LayerIdPlaceholder as layerid from fastdeploy.model_executor.models.utils import WeightMeta -from fastdeploy.model_executor.forward_meta import ForwardMeta class Ernie4_5_MLP(nn.Layer): @@ -347,19 +347,19 @@ def __init__( super().__init__() self.num_layers = fd_config.model_config.num_hidden_layers - fd_config.model_config.prefix_name = "ernie" + fd_config.model_config.pretrained_config.prefix_name = "ernie" self.embeddings = VocabParallelEmbedding( fd_config=fd_config, num_embeddings=fd_config.model_config.vocab_size, embedding_dim=fd_config.model_config.hidden_size, params_dtype=paddle.get_default_dtype(), - prefix=(f"{fd_config.model_config.prefix_name}.embed_tokens")) + prefix=(f"{fd_config.model_config.pretrained_config.prefix_name}.embed_tokens")) self.hidden_layers = nn.LayerList([ Ernie4_5_DecoderLayer( fd_config=fd_config, - 
prefix=f"{fd_config.model_config.prefix_name}.layers.{i}") + prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.layers.{i}") for i in range(self.num_layers) ]) @@ -367,7 +367,7 @@ def __init__( fd_config, hidden_size=fd_config.model_config.hidden_size, eps=fd_config.model_config.rms_norm_eps, - prefix=f"{fd_config.model_config.prefix_name}.norm", + prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.norm", ) def load_state_dict(self, state_dict): @@ -559,7 +559,7 @@ def _init_weight(self, layer): ] @classmethod - def _get_tensor_parallel_mappings(cls, config: ModelConfig, is_split=True): + def _get_tensor_parallel_mappings(cls, config, is_split=True): """ get_tensor_parallel_mappings """ diff --git a/fastdeploy/model_executor/models/ernie4_5_mtp.py b/fastdeploy/model_executor/models/ernie4_5_mtp.py index 4f9c571ce7..02a711c949 100644 --- a/fastdeploy/model_executor/models/ernie4_5_mtp.py +++ b/fastdeploy/model_executor/models/ernie4_5_mtp.py @@ -25,12 +25,12 @@ from paddleformers.transformers import PretrainedModel from paddleformers.utils.log import logger -from fastdeploy.config import FDConfig, ModelConfig +from fastdeploy.config import FDConfig +from fastdeploy.model_executor.forward_meta import ForwardMeta from fastdeploy.model_executor.layers.mtp_linear import ParallelEHProjection from fastdeploy.model_executor.layers.normalization import RMSNorm from fastdeploy.model_executor.models.ernie4_5_moe import Ernie4_5_DecoderLayer from fastdeploy.model_executor.models.model_base import ModelForCasualLM -from fastdeploy.model_executor.forward_meta import ForwardMeta class Ernie4_5_MTPPretrainedModel(PretrainedModel): @@ -47,7 +47,7 @@ def _init_weight(self, layer): return None @classmethod - def _get_tensor_parallel_mappings(cls, config: ModelConfig, is_split=True): + def _get_tensor_parallel_mappings(cls, config, is_split=True): """ get_tensor_parallel_mappings """ @@ -268,7 +268,7 @@ def __init__( self.hidden_layers = nn.LayerList([ Ernie4_5_DecoderLayer( fd_config=fd_config, - prefix=f"{fd_config.model_config.prefix_name}.{i}") + prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.{i}") for i in range(self.num_layers) ]) diff --git a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py index 6a0ccab404..51dc299852 100644 --- a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py +++ b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py @@ -338,20 +338,20 @@ def __init__( self.num_layers = fd_config.model_config.num_hidden_layers self.im_patch_id = fd_config.model_config.im_patch_id self._dtype = fd_config.model_config.dtype - fd_config.model_config.prefix_name = "ernie" + fd_config.model_config.pretrained_config.prefix_name = "ernie" self.embeddings = VocabParallelEmbedding( fd_config=fd_config, num_embeddings=fd_config.model_config.vocab_size, embedding_dim=fd_config.model_config.hidden_size, params_dtype=paddle.get_default_dtype, - prefix=(f"{fd_config.model_config.prefix_name}.embed_tokens"), + prefix=(f"{fd_config.model_config.pretrained_config.prefix_name}.embed_tokens"), ) self.hidden_layers = nn.LayerList([ Ernie4_5_VLDecoderLayer( fd_config=fd_config, - prefix=f"{fd_config.model_config.prefix_name}.layers.{i}") + prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.layers.{i}") for i in range(self.num_layers) ]) @@ -359,7 +359,7 @@ def __init__( fd_config, hidden_size=fd_config.model_config.hidden_size, 
eps=fd_config.model_config.rms_norm_eps, - prefix=f"{fd_config.model_config.prefix_name}.norm", + prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.norm", ) def load_state_dict(self, state_dict): diff --git a/fastdeploy/model_executor/models/qwen2.py b/fastdeploy/model_executor/models/qwen2.py index 8ead153597..81e0041079 100644 --- a/fastdeploy/model_executor/models/qwen2.py +++ b/fastdeploy/model_executor/models/qwen2.py @@ -24,6 +24,7 @@ from paddleformers.utils.log import logger from fastdeploy.config import FDConfig, ModelConfig +from fastdeploy.model_executor.forward_meta import ForwardMeta from fastdeploy.model_executor.graph_optimization.decorator import \ support_graph_optimization from fastdeploy.model_executor.layers.activation import SiluAndMul @@ -34,7 +35,6 @@ from fastdeploy.model_executor.layers.lm_head import ParallelLMHead from fastdeploy.model_executor.layers.normalization import RMSNorm from fastdeploy.model_executor.models.model_base import ModelForCasualLM -from fastdeploy.model_executor.forward_meta import ForwardMeta class Qwen2MLP(nn.Layer): @@ -228,20 +228,20 @@ def __init__( super().__init__() self.num_layers = fd_config.model_config.num_hidden_layers - fd_config.model_config.prefix_name = "qwen2" + fd_config.model_config.pretrained_config.prefix_name = "qwen2" self.embeddings = VocabParallelEmbedding( fd_config=fd_config, num_embeddings=fd_config.model_config.vocab_size, embedding_dim=fd_config.model_config.hidden_size, params_dtype=paddle.get_default_dtype, - prefix=(f"{fd_config.model_config.prefix_name}.embed_tokens"), + prefix=(f"{fd_config.model_config.pretrained_config.prefix_name}.embed_tokens"), ) self.layers = nn.LayerList([ Qwen2DecoderLayer( fd_config=fd_config, - prefix=f"{fd_config.model_config.prefix_name}.layers.{i}") + prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.layers.{i}") for i in range(self.num_layers) ]) @@ -249,7 +249,7 @@ def __init__( fd_config, hidden_size=fd_config.model_config.hidden_size, eps=fd_config.model_config.rms_norm_eps, - prefix=f"{fd_config.model_config.prefix_name}.norm", + prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.norm", ) def load_state_dict(self, state_dict): diff --git a/fastdeploy/model_executor/models/qwen3.py b/fastdeploy/model_executor/models/qwen3.py index 587a95611e..5a75a868ea 100644 --- a/fastdeploy/model_executor/models/qwen3.py +++ b/fastdeploy/model_executor/models/qwen3.py @@ -23,7 +23,8 @@ from paddleformers.transformers import PretrainedModel from paddleformers.utils.log import logger -from fastdeploy.config import FDConfig, ModelConfig +from fastdeploy.config import FDConfig +from fastdeploy.model_executor.forward_meta import ForwardMeta from fastdeploy.model_executor.graph_optimization.decorator import \ support_graph_optimization from fastdeploy.model_executor.layers.attention.attention import Attention @@ -34,7 +35,6 @@ from fastdeploy.model_executor.layers.normalization import RMSNorm from fastdeploy.model_executor.models.model_base import ModelForCasualLM from fastdeploy.model_executor.models.qwen2 import Qwen2DecoderLayer, Qwen2MLP -from fastdeploy.model_executor.forward_meta import ForwardMeta class Qwen3MLP(Qwen2MLP): @@ -164,20 +164,20 @@ def __init__( super().__init__() self.num_layers = fd_config.model_config.num_hidden_layers - fd_config.model_config.prefix_name = "model" + fd_config.model_config.pretrained_config.prefix_name = "model" self.embeddings = VocabParallelEmbedding( fd_config=fd_config, 
num_embeddings=fd_config.model_config.vocab_size, embedding_dim=fd_config.model_config.hidden_size, params_dtype=paddle.get_default_dtype, - prefix=(f"{fd_config.model_config.prefix_name}.embed_tokens"), + prefix=(f"{fd_config.model_config.pretrained_config.prefix_name}.embed_tokens"), ) self.layers = nn.LayerList([ Qwen3DecoderLayer( fd_config=fd_config, - prefix=f"{fd_config.model_config.prefix_name}.layers.{i}") + prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.layers.{i}") for i in range(self.num_layers) ]) @@ -185,7 +185,7 @@ def __init__( fd_config, hidden_size=fd_config.model_config.hidden_size, eps=fd_config.model_config.rms_norm_eps, - prefix=f"{fd_config.model_config.prefix_name}.norm", + prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.norm", ) def load_state_dict(self, state_dict): @@ -307,7 +307,7 @@ def _init_weight(self, layer): return None @classmethod - def _get_tensor_parallel_mappings(cls, config: ModelConfig, is_split=True): + def _get_tensor_parallel_mappings(cls, config, is_split=True): from paddleformers.transformers.conversion_utils import \ split_or_merge_func diff --git a/fastdeploy/model_executor/models/qwen3moe.py b/fastdeploy/model_executor/models/qwen3moe.py index e3ba71ab23..2ea37da4e1 100644 --- a/fastdeploy/model_executor/models/qwen3moe.py +++ b/fastdeploy/model_executor/models/qwen3moe.py @@ -23,20 +23,19 @@ from paddleformers.transformers import PretrainedModel from paddleformers.utils.log import logger -from fastdeploy.config import FDConfig, ModelConfig +from fastdeploy.config import FDConfig +from fastdeploy.model_executor.forward_meta import ForwardMeta from fastdeploy.model_executor.graph_optimization.decorator import \ support_graph_optimization from fastdeploy.model_executor.layers.activation import SiluAndMul -from fastdeploy.model_executor.layers.attention.attention import Attention from fastdeploy.model_executor.layers.embeddings import VocabParallelEmbedding from fastdeploy.model_executor.layers.linear import ( - MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) + MergedColumnParallelLinear, RowParallelLinear) from fastdeploy.model_executor.layers.lm_head import ParallelLMHead from fastdeploy.model_executor.layers.moe.moe import FusedMoE from fastdeploy.model_executor.layers.normalization import RMSNorm from fastdeploy.model_executor.models.model_base import ModelForCasualLM from fastdeploy.model_executor.models.qwen3 import Qwen3Attention -from fastdeploy.model_executor.forward_meta import ForwardMeta class Qwen3MLP(nn.Layer): @@ -200,20 +199,20 @@ def __init__( super().__init__() self.num_layers = fd_config.model_config.num_hidden_layers - fd_config.model_config.prefix_name = "model" + fd_config.model_config.pretrained_config.prefix_name = "model" self.embeddings = VocabParallelEmbedding( fd_config, num_embeddings=fd_config.model_config.vocab_size, embedding_dim=fd_config.model_config.hidden_size, params_dtype=paddle.get_default_dtype, - prefix=(f"{fd_config.model_config.prefix_name}.embed_tokens"), + prefix=(f"{fd_config.model_config.pretrained_config.prefix_name}.embed_tokens"), ) self.layers = nn.LayerList([ Qwen3DecoderLayer( fd_config, - prefix=f"{fd_config.model_config.prefix_name}.layers.{i}") + prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.layers.{i}") for i in range(self.num_layers) ]) @@ -221,7 +220,7 @@ def __init__( fd_config, hidden_size=fd_config.model_config.hidden_size, eps=1e-6, - prefix=f"{fd_config.model_config.prefix_name}.norm", + 
prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.norm", ) def load_state_dict(self, state_dict): @@ -338,7 +337,7 @@ def _init_weight(self, layer): return None @classmethod - def _get_tensor_parallel_mappings(cls, config: ModelConfig, is_split=True): + def _get_tensor_parallel_mappings(cls, config, is_split=True): # TODO not support TP split now, next PR will support TP. from paddleformers.transformers.conversion_utils import \ diff --git a/fastdeploy/model_executor/models/tp_utils.py b/fastdeploy/model_executor/models/tp_utils.py index 426651a62f..ab2b7ab946 100644 --- a/fastdeploy/model_executor/models/tp_utils.py +++ b/fastdeploy/model_executor/models/tp_utils.py @@ -38,7 +38,7 @@ def check_tensor_parallel_prerequisites( """check_tensor_parallel_prerequisites""" if fd_config.parallel_config.tensor_parallel_size > 1: tensor_parallel_map = cls._get_tensor_parallel_mappings( - fd_config.model_config, is_split=True) + fd_config.model_config.pretrained_config, is_split=True) if not tensor_parallel_map: logger.error("filtered_quant_map should not be empty. \ parallel splitting required, but _get_tensor_parallel_mappings is not implemented." diff --git a/fastdeploy/spec_decode/mtp.py b/fastdeploy/spec_decode/mtp.py index 26b9e2948f..ed25bc78a4 100644 --- a/fastdeploy/spec_decode/mtp.py +++ b/fastdeploy/spec_decode/mtp.py @@ -21,6 +21,7 @@ import paddle from fastdeploy.engine.request import Request +from fastdeploy.model_executor.forward_meta import ForwardMeta from fastdeploy.model_executor.layers.attention import get_attention_backend from fastdeploy.model_executor.layers.attention.base_attention_backend import \ AttentionBackend @@ -36,7 +37,6 @@ share_external_data) from fastdeploy.model_executor.pre_and_post_process import (pre_process, rebuild_padding) -from fastdeploy.model_executor.forward_meta import ForwardMeta from .base import Proposer @@ -73,7 +73,7 @@ def _update_cfg(self, main_model): self.model_config.num_hidden_layers = 1 self.parallel_config.model_name_or_path = ( self.speculative_config.model_name_or_path) - self.model_config.prefix_name = "ernie.mtp_block" + self.model_config.pretrained_config.prefix_name = "ernie.mtp_block" if self.speculative_config.quantization != "": self.model_config.quantization = ( self.speculative_config.quantization) diff --git a/fastdeploy/worker/gcu_model_runner.py b/fastdeploy/worker/gcu_model_runner.py index ba2879eaaf..7e7349c410 100644 --- a/fastdeploy/worker/gcu_model_runner.py +++ b/fastdeploy/worker/gcu_model_runner.py @@ -24,6 +24,7 @@ from fastdeploy.config import FDConfig from fastdeploy.engine.request import Request +from fastdeploy.model_executor.forward_meta import ForwardMeta from fastdeploy.model_executor.guided_decoding import get_guided_backend from fastdeploy.model_executor.guided_decoding.base_guided_decoding import \ LogitsProcessorBase @@ -39,7 +40,6 @@ from fastdeploy.model_executor.pre_and_post_process import (post_process, pre_process, rebuild_padding) -from fastdeploy.model_executor.forward_meta import ForwardMeta from fastdeploy.worker.model_runner_base import ModelRunnerBase from fastdeploy.worker.output import ModelOutputData, ModelRunnerOutput @@ -670,7 +670,7 @@ def initialize_kv_cache(self) -> None: # Get kv cache shape kv_cache_shape = self.attn_backends[0].get_kv_cache_shape( max_num_blocks=max_block_num) - # local_rank = self.local_rank % self.parallel_config.tensor_parallel_degree + # local_rank = self.local_rank % self.parallel_config.tensor_parallel_size if not self.parallel_config.do_profile and 
( self.parallel_config.enable_prefix_caching \ @@ -701,10 +701,10 @@ def initialize_attn_backend(self) -> None: """ assert len(self.attn_backends) == 0 - num_heads = self.model_config.num_attention_heads // self.parallel_config.tensor_parallel_degree + num_heads = self.model_config.num_attention_heads // self.parallel_config.tensor_parallel_size self.model_config.kv_num_heads = int( self.model_config.num_key_value_heads - ) // self.parallel_config.tensor_parallel_degree + ) // self.parallel_config.tensor_parallel_size head_dim = self.model_config.head_dim # Get the attention backend @@ -783,14 +783,14 @@ def _dummy_run(self, ) sampler_output = self.sampler(logits, self.sampling_metadata) - if self.parallel_config.tensor_parallel_degree > 1: + if self.parallel_config.tensor_parallel_size > 1: paddle.distributed.broadcast(sampler_output.sampled_token_ids, 0) else: self.sampler(logits, self.sampling_metadata, self.parallel_config.max_model_len, self.share_inputs) sampler_output = None - if self.parallel_config.tensor_parallel_degree > 1: + if self.parallel_config.tensor_parallel_size > 1: paddle.distributed.broadcast( self.share_inputs["accept_tokens"], 0) paddle.distributed.broadcast( @@ -1016,14 +1016,14 @@ class at the server level, which is too granular for ModelRunner. self.sampling_metadata, skip_idx_list, ) - if self.parallel_config.tensor_parallel_degree > 1: + if self.parallel_config.tensor_parallel_size > 1: paddle.distributed.broadcast(sampler_output.sampled_token_ids, 0) else: self.sampler(logits, self.sampling_metadata, self.parallel_config.max_model_len, self.share_inputs) sampler_output = None - if self.parallel_config.tensor_parallel_degree > 1: + if self.parallel_config.tensor_parallel_size > 1: paddle.distributed.broadcast( self.share_inputs["accept_tokens"], 0) paddle.distributed.broadcast(self.share_inputs["accept_num"], diff --git a/fastdeploy/worker/iluvatar_model_runner.py b/fastdeploy/worker/iluvatar_model_runner.py index 9fee4268da..565e9abbe7 100644 --- a/fastdeploy/worker/iluvatar_model_runner.py +++ b/fastdeploy/worker/iluvatar_model_runner.py @@ -24,6 +24,7 @@ from fastdeploy.config import FDConfig from fastdeploy.engine.request import Request +from fastdeploy.model_executor.forward_meta import ForwardMeta from fastdeploy.model_executor.layers.attention import get_attention_backend from fastdeploy.model_executor.layers.attention.base_attention_backend import \ AttentionBackend @@ -37,7 +38,6 @@ pre_process, rebuild_padding, step_cuda) -from fastdeploy.model_executor.forward_meta import ForwardMeta from fastdeploy.worker.model_runner_base import ModelRunnerBase from fastdeploy.worker.output import ModelOutputData, ModelRunnerOutput @@ -672,11 +672,11 @@ def initialize_attn_backend(self) -> None: assert len(self.attn_backends) == 0 # TODO(gongshaotian): Get rank from config - num_heads = self.model_config.num_attention_heads // self.parallel_config.tensor_parallel_degree + num_heads = self.model_config.num_attention_heads // self.parallel_config.tensor_parallel_size self.model_config.kv_num_heads = max( 1, int(self.model_config.num_key_value_heads) // - self.parallel_config.tensor_parallel_degree) + self.parallel_config.tensor_parallel_size) head_dim = self.model_config.head_dim # Get the attention backend @@ -748,14 +748,14 @@ def _dummy_run(self, ) sampled_token_ids = self.sampler(logits, self.sampling_metadata) - if self.parallel_config.tensor_parallel_degree > 1: + if self.parallel_config.tensor_parallel_size > 1: paddle.distributed.broadcast(sampled_token_ids, 
0) else: self.sampler(logits, self.sampling_metadata, self.parallel_config.max_model_len, self.share_inputs) sampled_token_ids = None - if self.parallel_config.tensor_parallel_degree > 1: + if self.parallel_config.tensor_parallel_size > 1: paddle.distributed.broadcast( self.share_inputs["accept_tokens"], 0) paddle.distributed.broadcast( @@ -977,14 +977,14 @@ class at the server level, which is too granular for ModelRunner. self.sampling_metadata, skip_idx_list, ) - if self.parallel_config.tensor_parallel_degree > 1: + if self.parallel_config.tensor_parallel_size > 1: paddle.distributed.broadcast(sampled_token_ids, 0) else: self.sampler(logits, self.sampling_metadata, self.parallel_config.max_model_len, self.share_inputs) sampled_token_ids = None - if self.parallel_config.tensor_parallel_degree > 1: + if self.parallel_config.tensor_parallel_size > 1: paddle.distributed.broadcast( self.share_inputs["accept_tokens"], 0) paddle.distributed.broadcast(self.share_inputs["accept_num"], diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py index d457f68cd4..e7d4967d79 100644 --- a/fastdeploy/worker/worker_process.py +++ b/fastdeploy/worker/worker_process.py @@ -607,6 +607,9 @@ def initialize_fd_config(args: argparse.Namespace) -> FDConfig: logger.info( f"parallel_config.tensor_parallel_size {parallel_config.tensor_parallel_size}" ) + logger.info( + f"parallel_config.tensor_parallel_rank {parallel_config.tensor_parallel_rank}" + ) if getattr(model_config, 'num_hidden_layers', None) is None: raise ValueError("num_hidden_layers is None") From ae67d3934234d8a23099e48be88bc6099f2fb11d Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Mon, 14 Jul 2025 07:01:16 +0000 Subject: [PATCH 15/19] fix vl --- fastdeploy/worker/vl_gpu_model_runner.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fastdeploy/worker/vl_gpu_model_runner.py b/fastdeploy/worker/vl_gpu_model_runner.py index 7fe9c465be..dce6e7c487 100644 --- a/fastdeploy/worker/vl_gpu_model_runner.py +++ b/fastdeploy/worker/vl_gpu_model_runner.py @@ -31,6 +31,7 @@ SpeculativeConfig) from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer from fastdeploy.input.mm_processor import DataProcessor +from fastdeploy.model_executor.forward_meta import ForwardMeta from fastdeploy.model_executor.layers.attention import get_attention_backend from fastdeploy.model_executor.layers.rotary_embedding import get_rope_3d from fastdeploy.model_executor.layers.sample.meta_data import SamplingMetadata @@ -44,7 +45,6 @@ from fastdeploy.model_executor.models.ernie4_5_vl.modeling_resampler import ( ScatterOp, VariableResolutionResamplerModel) from fastdeploy.platforms import current_platform -from fastdeploy.model_executor.forward_meta import ForwardMeta from fastdeploy.worker.output import SamplerOutput from fastdeploy.worker.utils import check_safetensors_model from fastdeploy.worker.vl_model_runner_base import VLModelRunnerBase @@ -1157,6 +1157,9 @@ def build_stream_line_model( ) logger.info("============================================") + # TODO(YuanRisheng) The moe_k in develop is fixed to 8, need to be changed according to json config + model_config.moe_k = 8 + fd_config = FDConfig( model_config=model_config, parallel_config=parallel_config, From 3be577c68e336be5c2f18bb6c09e3f105446f4f0 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Mon, 14 Jul 2025 08:09:14 +0000 Subject: [PATCH 16/19] fix log --- fastdeploy/model_executor/layers/embeddings.py | 1 - 1 file changed, 1 deletion(-) diff --git 
a/fastdeploy/model_executor/layers/embeddings.py b/fastdeploy/model_executor/layers/embeddings.py index e843bc7544..a0fb4fcc42 100644 --- a/fastdeploy/model_executor/layers/embeddings.py +++ b/fastdeploy/model_executor/layers/embeddings.py @@ -113,7 +113,6 @@ def load_state_dict(self, state_dict: Dict[str, state_dict (dict): A dictionary containing the checkpoint weights and biases. """ a = state_dict[self.prefix + ".weight"] - logger.info(f"tensor : {a}") if self.tie_word_embeddings: self.word_embeddings.weight.set_value( get_tensor(state_dict[self.prefix + ".weight"]).astype( From 8ebbfa2749847acb2ab109158dd41a7e1495efdf Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Mon, 14 Jul 2025 09:16:45 +0000 Subject: [PATCH 17/19] fix qwen moe --- fastdeploy/model_executor/models/qwen3moe.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fastdeploy/model_executor/models/qwen3moe.py b/fastdeploy/model_executor/models/qwen3moe.py index 2ea37da4e1..2e0c91ac33 100644 --- a/fastdeploy/model_executor/models/qwen3moe.py +++ b/fastdeploy/model_executor/models/qwen3moe.py @@ -114,13 +114,13 @@ def __init__( f"{prefix}.mlp.experts.{{}}.down_proj.weight", } - if (fd_config.model_config.num_experts is not None + if (fd_config.model_config.moe_num_experts is not None and layer_id >= fd_config.model_config.moe_layer_start_index): self.mlp = FusedMoE(fd_config, moe_intermediate_size=fd_config.model_config. moe_intermediate_size, - num_experts=fd_config.model_config.num_experts, + num_experts=fd_config.model_config.moe_num_experts, top_k=fd_config.model_config.num_experts_per_tok, layer_idx=layer_id, weight_key_map=weight_key_map) @@ -408,13 +408,13 @@ def get_tensor_parallel_split_mappings(num_layers, num_experts): return final_actions num_experts = 0 - if isinstance(config.num_experts, list): - num_experts = sum(config.num_experts) - elif isinstance(config.num_experts, int): - num_experts = config.num_experts + if isinstance(config.moe_num_experts, list): + num_experts = sum(config.moe_num_experts) + elif isinstance(config.moe_num_experts, int): + num_experts = config.moe_num_experts else: raise ValueError( - f"Not support type of num_experts [{type(config.num_experts)}]" + f"Not support type of num_experts [{type(config.moe_num_experts)}]" ) mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers, From 63db1cb2fb56541b36b7ded89082f133a84d49c5 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Mon, 14 Jul 2025 09:44:48 +0000 Subject: [PATCH 18/19] fix qwen moe --- fastdeploy/model_executor/models/qwen3moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastdeploy/model_executor/models/qwen3moe.py b/fastdeploy/model_executor/models/qwen3moe.py index 2e0c91ac33..94cbbe10d2 100644 --- a/fastdeploy/model_executor/models/qwen3moe.py +++ b/fastdeploy/model_executor/models/qwen3moe.py @@ -121,7 +121,7 @@ def __init__( moe_intermediate_size=fd_config.model_config. 
moe_intermediate_size, num_experts=fd_config.model_config.moe_num_experts, - top_k=fd_config.model_config.num_experts_per_tok, + top_k=fd_config.model_config.top_k, layer_idx=layer_id, weight_key_map=weight_key_map) else: From 14703d34ef8d638e8bec4242a26141fe428343c6 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Mon, 14 Jul 2025 10:56:45 +0000 Subject: [PATCH 19/19] fix qwen moe --- fastdeploy/model_executor/models/qwen3moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastdeploy/model_executor/models/qwen3moe.py b/fastdeploy/model_executor/models/qwen3moe.py index 94cbbe10d2..b222f48abf 100644 --- a/fastdeploy/model_executor/models/qwen3moe.py +++ b/fastdeploy/model_executor/models/qwen3moe.py @@ -121,7 +121,7 @@ def __init__( moe_intermediate_size=fd_config.model_config. moe_intermediate_size, num_experts=fd_config.model_config.moe_num_experts, - top_k=fd_config.model_config.top_k, + top_k=fd_config.model_config.moe_topk, layer_idx=layer_id, weight_key_map=weight_key_map) else: