diff --git a/fastdeploy/config.py b/fastdeploy/config.py index 2f4f4b19df..d236978919 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -21,14 +21,15 @@ from typing import Literal, Optional, Union from paddleformers.transformers.configuration_utils import PretrainedConfig +from paddleformers.trl import llm_utils +from fastdeploy import envs from fastdeploy.model_executor.layers.quantization.quant_base import \ QuantConfigBase from fastdeploy.utils import get_logger logger = get_logger("config", "config.log") - class MoEPhase(Enum): """ The generation phase of the moe. @@ -37,274 +38,228 @@ class MoEPhase(Enum): PREFILL = 1 DECODER = 2 - -class ModelConfig(PretrainedConfig): +PRETRAINED_INIT_CONFIGURATION = { + "rope_theta": 10000.0, + "num_key_value_heads":-1, + "start_layer_index": 0, + "moe_num_shared_experts":0, + "moe_layer_start_index": 0, + "num_max_dispatch_tokens_per_rank":256, + "moe_use_aux_free":False, + "vocab_size": -1, + "use_rope": True, + "hidden_dropout_prob":0.0, + "initializer_range":0.02, + "max_position_embeddings":512, + "quantization_config":None, + "use_recompute_resampler":False, + "use_temporal_conv":True, + "resampler_fuse_rms_norm":False, + "freq_allocation":20, + "tie_word_embeddings":False, + "rms_norm_eps":1e-5, +} + + +class ModelConfig: """ The configuration class to store the configuration of a `LLM`. """ - max_stop_seqs_num = 5 - stop_seqs_max_len = 8 - - architectures: list[str] = [] - - # NOTE(gongshaotain): form _load_model_init_val() - top_p = 0.0 - temperature = 1.0 - rope_theta = 10000.0 - penalty_score = 1.0 - frequency_score = 0.0 - presence_score = 0.0 - min_length = 1 - def __init__( self, - vocab_size: int = 100224, - hidden_size: int = 4096, - num_layers: int = 48, - num_attention_heads: int = 32, - num_key_value_heads: Optional[int] = None, - hidden_act: str = "swiglu", - hidden_dropout_prob: float = 0.0, - max_position_embeddings: int = 512, - max_seq_len: int = 512, - initializer_range: float = 0.02, - use_rope=True, - rope_theta: int = 10000, - rope_3d: bool = False, - ori_vocab_size: int | None = None, - moe_layer_start_index: Union[int, list[int], None] = None, - moe_num_experts: Union[int, list[int], None] = None, - moe_layer_end_index: Union[int, list[int], None] = None, - moe_num_shared_experts: int | None = None, - num_hidden_layers: int | None = None, - prefix_name="", - freeze_embedding=False, - rope_head_dim=None, - ffn_hidden_size: Optional[int] = None, - dtype="bfloat16", - start_layer_index: int = 0, - head_dim: Optional[int] = None, - tie_word_embeddings: bool = False, - is_quantized: bool = False, - rms_norm_eps: float = 1e-5, - **kwargs, + args, ): - super().__init__(**kwargs) - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_layers = num_layers - if num_hidden_layers is not None: - self.num_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - if head_dim is None: + self.max_stop_seqs_num = 5 + self.stop_seqs_max_len = 8 + + # NOTE(gongshaotain): form _load_model_init_val() + self.top_p = 0.0 + self.temperature = 1.0 + self.rope_theta = 10000.0 + self.penalty_score = 1.0 + self.frequency_score = 0.0 + self.presence_score = 0.0 + self.min_length = 1 + self.model_name_or_path = "" + + self.im_patch_id = ( + 100295 # multimodality, TODO(liuyuanle): read from config.json + ) + self.is_quantized = False + self.max_model_len = 0 + self.dtype = "" + self.enable_logprob = False + + for key, value in args.items(): + if 
hasattr(self, key): + setattr(self, key, value) + + pretrained_config, _ = PretrainedConfig.get_config_dict(self.model_name_or_path) + self.pretrained_config = PretrainedConfig.from_dict(pretrained_config) + + # set attribute from pretrained_config + for key, value in pretrained_config.items(): + setattr(self, key, value) + + # we need set default value when not exist + for key, value in PRETRAINED_INIT_CONFIGURATION.items(): + if not hasattr(self, key): + setattr(self, key, value) + + if not hasattr(self, "head_dim"): self.head_dim = self.hidden_size // self.num_attention_heads - else: - self.head_dim = head_dim - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.use_rope = use_rope - self.rope_theta = rope_theta - self.ori_vocab_size = ori_vocab_size or vocab_size - self.max_seq_len = max_seq_len - self.prefix_name = prefix_name - self.freeze_embedding = freeze_embedding - self.rope_head_dim = rope_head_dim - self.moe_layer_start_index = moe_layer_start_index - self.moe_num_experts = moe_num_experts - self.moe_num_shared_experts = moe_num_shared_experts - self.moe_layer_end_index = moe_layer_end_index - self.ffn_hidden_size = ffn_hidden_size - self.rope_3d = rope_3d - self.start_layer_index = start_layer_index - self.dtype = dtype - self.tie_word_embeddings = tie_word_embeddings - self.is_quantized = is_quantized - self.rms_norm_eps = rms_norm_eps + if hasattr(self, "vision_config"): + self.vision_config = PretrainedConfig.from_dict(self.vision_config) -@dataclass -class MoEConfig: - """ - Configuration for MoE. - """ - num_experts: Union[int, list[int], None] = None - top_k: int = 8 - moe_intermediate_size: int = -1 - num_experts_per_rank: int = -1 - num_experts_start_offset: int = -1 - - moe_num_shared_experts = (0, ) - moe_layer_start_index: Union[int, list[int], None] = None - moe_layer_end_index: Union[int, list[int], None] = None - moe_use_aux_free: bool = False - num_max_dispatch_tokens_per_rank = 256 - im_patch_id = ( - 100295 # multimodality, TODO(liuyuanle): read from config.json - ) - + self.ori_vocab_size = self.vocab_size + if "Ernie4_5_ForCausalLM" in self.architectures or "Ernie4_5_MoeForCausalLM" in self.architectures: + self.ori_vocab_size = args["ori_vocab_size"] -@dataclass class ParallelConfig: """Configuration for the distributed execution.""" - block_size = 16 # The block size for processing. - sequence_parallel = False # Whether to enable sequence parallelism. - use_ep = False # Whether to enable Expert Parallelism - moe_phase = MoEPhase.PREFILL # Generation phase - msg_queue_id = 1 # mesage queue id - tensor_parallel_rank = None # TP rank ID - tensor_parallel_degree = None # TP degree - expert_parallel_rank = None # EP rank ID - expert_parallel_degree = None # EP degree - # The embedding weight distributed on your gpu cards is divided by row or column. - # Defaults to False means divide by row. When vocab_size can not be divided by world_size - # but hidden_size can, we can consider split embedding weight by column. 
- """ - From old wersion worker args - TODO(gongshaotian): Reclassify - """ - model_name_or_path: str = "./output" - max_num_seqs: int = 34 - # Set default block num for profile run - max_block_num: int = 2000 - # block size - block_size: int = 64 - # Engine worker queue port - engine_worker_queue_port: int = 9923 - # Max model len - max_model_len: int = 3072 # max_seq_len - # cuda visible devices - device_ids: str = "0" - # Input dtype - dtype: str = "bfloat16" - # Encoder's decoder num - enc_dec_block_num: int = 1 - # KV cache ratio for input - kv_cache_ratio: float = 0.7 - # First token id - first_token_id: int = 1 - # Gpu memory utilization - gpu_memory_utilization: float = 0.9 - # Process ID of engine - engine_pid: Optional[int] = None - # Do profile or not - do_profile: bool = False - # - pad_token_id: int = -1 - # - eos_tokens_lens: int = 2 - # Enable chunked prefill - enable_chunked_prefill: str = "store_true" - - max_num_batched_tokens: int = 2048 - # enable prefix cache - enable_prefix_caching = None - # splitwise role - splitwise_role: str = "mixed" - # guided decoding backend - guided_decoding_backend: str = None - # disable any whitespace for guided decoding - disable_any_whitespace: bool = True - # enable the custom all-reduce kernel and fall back to NCCL(dist.all_reduce). - enable_custom_all_reduce: str = "store_true" - + def __init__( + self, + args, + ): + self.sequence_parallel = False # Whether to enable sequence parallelism. + self.use_ep = False # Whether to enable Expert Parallelism + self.moe_phase = MoEPhase.PREFILL # Generation phase + self.msg_queue_id = 1 # mesage queue id + + tensor_parallel_rank, tensor_parallel_size = llm_utils.init_dist_env() + self.tensor_parallel_rank = tensor_parallel_rank # TP rank ID + self.tensor_parallel_size = tensor_parallel_size # TP degree + self.expert_parallel_rank = int(tensor_parallel_rank / tensor_parallel_size) # EP rank ID + self.expert_parallel_size = 1 # EP degree + # The embedding weight distributed on your gpu cards is divided by row or column. + # Defaults to False means divide by row. When vocab_size can not be divided by world_size + # but hidden_size can, we can consider split embedding weight by column. 
+ """ + From old wersion worker args + TODO(gongshaotian): Reclassify + """ + self.model_name_or_path: str = "./output" + self.max_num_seqs: int = 34 + # Set default block num for profile run + self.max_block_num: int = 2000 + # block size + self.block_size: int = 64 + # Engine worker queue port + self.engine_worker_queue_port: int = 9923 + # Max model len + self.max_model_len: int = 3072 # max_seq_len + # cuda visible devices + self.device_ids: str = "0" + # Input dtype + self.dtype: str = "bfloat16" + # Encoder's decoder num + self.enc_dec_block_num: int = 1 + # KV cache ratio for input + self.kv_cache_ratio: float = 0.7 + # First token id + self.first_token_id: int = 1 + # Gpu memory utilization + self.gpu_memory_utilization: float = 0.9 + # Process ID of engine + self.engine_pid: Optional[int] = None + # Do profile or not + self.do_profile: bool = False + # + self.pad_token_id: int = -1 + # + self.eos_tokens_lens: int = 2 + # Enable chunked prefill + self.enable_chunked_prefill: bool = False + + self.max_num_batched_tokens: int = 2048 + # enable prefix cache + self.enable_prefix_caching = None + # splitwise role + self.splitwise_role: str = "mixed" + # guided decoding backend + self.guided_decoding_backend: str = None + # disable any whitespace for guided decoding + self.disable_any_whitespace: bool = True + self.pod_ip: str = None + for key, value in args.items(): + if hasattr(self, key): + setattr(self, key, value) + self.use_ep = args["expert_parallel_size"] > 1 + if self.splitwise_role == "mixed": + self.moe_phase = MoEPhase.PREFILL + elif self.splitwise_role == "prefill": + self.moe_phase = MoEPhase.PREFILL + elif self.splitwise_role == "decode": + self.moe_phase = MoEPhase.DECODER + else: + raise NotImplementedError + # enable the custom all-reduce kernel and fall back to NCCL(dist.all_reduce). + self.enable_custom_all_reduce: bool = False -@dataclass class SpeculativeConfig: """ Configuration for speculative decoding. """ - # speculative method, choose in [None, "ngram_match", "mtp"] - method: Optional[str] = None - # the max length of speculative tokens - num_speculative_tokens: int = 1 - # the max length of candidate tokens for speculative method - max_candidate_len: int = 5 - # the max length of verify window for speculative method - verify_window: int = 2 - # ngram match - max_ngram_size: int = 5 - # model for mtp/eagle/draft_model - model_name_or_path: Optional[str] = None - # quantization of model - quantization: Optional[str] = None - # allocate more blocks to prevent mtp from finishing the block earlier than the main model - # Fixed now - num_gpu_block_expand_ratio: Optional[float] = 1 - # To distinguish the main model and draft model(mtp/eagle/draftmodel) - # ["main", "mtp"] - model_type: Optional[str] = "main" - # TODO(liuzichang): To reduce memory usage, MTP shares the main model's lm_head and embedding layers. - # A trick method is currently used to enable this sharing. - # This will be replaced with a more standardized solution in the future. - sharing_model = None - # During benchmarking, we need to enforce that the number of accepted tokens is 1. - # This means no tokens from MTP are accepted. - # This ensures that the specified simulation acceptance rate is not affected. 
- benchmark_mode: bool = False - + def __init__( + self, + args, + ): + # speculative method, choose in [None, "ngram_match", "mtp"] + self.method: Optional[str] = None + # the max length of speculative tokens + self.num_speculative_tokens: int = 1 + # the max length of candidate tokens for speculative method + self.max_candidate_len: int = 5 + # the max length of verify window for speculative method + self.verify_window: int = 2 + # ngram match + self.max_ngram_size: int = 5 + # model for mtp/eagle/draft_model + self.model_name_or_path: Optional[str] = None + # quantization of model + self.quantization: Optional[str] = None + # allocate more blocks to prevent mtp from finishing the block earlier than the main model + # Fixed now + self.num_gpu_block_expand_ratio: Optional[float] = 1 + # To distinguish the main model and draft model(mtp/eagle/draftmodel) + # ["main", "mtp"] + self.model_type: Optional[str] = "main" + # TODO(liuzichang): To reduce memory usage, MTP shares the main model's lm_head and embedding layers. + # A trick method is currently used to enable this sharing. + # This will be replaced with a more standardized solution in the future. + self.sharing_model = None + # During benchmarking, we need to enforce that the number of accepted tokens is 1. + # This means no tokens from MTP are accepted. + # This ensures that the specified simulation acceptance rate is not affected. + self.benchmark_mode: bool = False + + #TODO(YuanRisheng): The name of the server args is different from the name of the SpeculativeConfig. + #We temperately add the name map here and will delete it in future. + name_map = {"speculative_method": "method", + "speculative_max_draft_token_num": "num_speculative_tokens", + "speculative_model_name_or_path": "model_name_or_path", + "speculative_model_quantization": "quantization", + "speculative_benchmark_mode": "benchmark_mode"} + + for key, value in args.items(): + if key in name_map.keys() and hasattr(self, name_map[key]): + setattr(self, name_map[key], value) -@dataclass class DeviceConfig: """ Configuration for device settings. """ - device_type = "cuda" - + def __init__( + self, + args, + ): + self.device_type = "cuda" + for key, value in args.items(): + if hasattr(self, key): + setattr(self, key, value) class GraphOptimizationConfig: - """The Top-level graph optimization contral corresponds to different backends. - - 0: dyncmic graph - - 1: static graph - - 2: static graph + cinn compilation backend - """ - graph_opt_level: int = 0 - - # CUDA Graph Config - """ Whether to use cudagraph. - - False: cudagraph is not used. - - True: cudagraph is used. - It requires that all input buffers have fixed addresses, and all - splitting ops write their outputs to input buffers. - - With dyncmic graph backend: ... - - With static grpah backend: WIP - """ - use_cudagraph: bool = False - """Sizes to capture cudagraph. - - None (default): capture sizes are inferred from llm config. - - list[int]: capture sizes are specified as given.""" - cudagraph_capture_sizes: Optional[list[int]] = None - """ Number of warmup runs for cudagraph. """ - cudagraph_num_of_warmups: int = 2 - """Whether to copy input tensors for cudagraph. - If the caller can guarantee that the same input buffers - are always used, it can set this to False. Otherwise, it should - set this to True.""" - cudagraph_copy_inputs: bool = False - """ In static graph, this is an operation list that does not need to be captured by the CUDA graph. - CudaGraphBackend will split these operations from the static graph. 
- Example usage: - cudagraph_splitting_ops = ["paddle.unified_attention"] - - Note: If want to use subgraph capture functionality in a dynamic graph, - can manually split the model into multiple layers and apply the @support_cuda_graph decorator - only to the layer where CUDA graph functionality is required. - """ - cudagraph_splitting_ops = Optional[list[str]] - """"whether to use a full cuda graph for the entire forward pass rather than - splitting certain operations such as attention into subgraphs. - Thus this flag cannot be used together with splitting_ops.""" - full_cuda_graph: bool = False - - max_capture_size: int = field(default=None, init=False) # type: ignore - batch_size_to_captured_size: dict[int, - int] = field(default=None, - init=False) # type: ignore - - # CINN Config ... - def init_with_cudagrpah_size(self, cudagraph_capture_sizes: list[int]) -> None: """To complete the initialization of config, @@ -338,18 +293,67 @@ def init_with_cudagrpah_size(self, def __init__(self, enable_static_graph_inference: bool = False, - use_cudagraph: bool = False, - max_capture_batch_size: int = 64): - """ """ + max_capture_batch_size: int = 64, + args = None): + """The Top-level graph optimization contral corresponds to different backends. + - 0: dyncmic graph + - 1: static graph + - 2: static graph + cinn compilation backend + """ + self.graph_opt_level: int = 0 + + # CUDA Graph Config + """ Whether to use cudagraph. + - False: cudagraph is not used. + - True: cudagraph is used. + It requires that all input buffers have fixed addresses, and all + splitting ops write their outputs to input buffers. + - With dyncmic graph backend: ... + - With static grpah backend: WIP + """ + self.use_cudagraph: bool = False + """Sizes to capture cudagraph. + - None (default): capture sizes are inferred from llm config. + - list[int]: capture sizes are specified as given.""" + self.cudagraph_capture_sizes: Optional[list[int]] = None + """ Number of warmup runs for cudagraph. """ + self.cudagraph_num_of_warmups: int = 2 + """Whether to copy input tensors for cudagraph. + If the caller can guarantee that the same input buffers + are always used, it can set this to False. Otherwise, it should + set this to True.""" + self.cudagraph_copy_inputs: bool = False + """ In static graph, this is an operation list that does not need to be captured by the CUDA graph. + CudaGraphBackend will split these operations from the static graph. + Example usage: + cudagraph_splitting_ops = ["paddle.unified_attention"] + + Note: If want to use subgraph capture functionality in a dynamic graph, + can manually split the model into multiple layers and apply the @support_cuda_graph decorator + only to the layer where CUDA graph functionality is required. + """ + self.cudagraph_splitting_ops = Optional[list[str]] + """"whether to use a full cuda graph for the entire forward pass rather than + splitting certain operations such as attention into subgraphs. + Thus this flag cannot be used together with splitting_ops.""" + self.full_cuda_graph: bool = False + + self.max_capture_size: int = field(default=None, init=False) # type: ignore + self.batch_size_to_captured_size: dict[int, + int] = field(default=None, + init=False) # type: ignore + + # CINN Config ... 
+ + for key, value in args.items(): + if hasattr(self, key): + setattr(self, key, value) capture_size = [i for i in range(1, max_capture_batch_size + 1)] self.init_with_cudagrpah_size(cudagraph_capture_sizes=capture_size) - self.use_cudagraph = use_cudagraph #TODO(wangmingkai02): change graph_opt_level=2 when using static mode with cinn if enable_static_graph_inference: self.graph_opt_level = 1 - -@dataclass class LoadConfig: """ Configuration for dynamic weight loading strategies @@ -363,37 +367,39 @@ class LoadConfig: - 'meta': provide RL traing worker, no_weights_load - None: No dynamic loading """ - use_fastsafetensor: bool = False - dynamic_load_weight: bool = False - load_strategy: Optional[Literal['ipc', 'ipc_no_reshard', 'ipc_snapshot', 'meta']] = None - - def __post_init__(self): - if self.load_strategy is not None and not self.dynamic_load_weight: - raise ValueError("Load strategy requires dynamic_load_weight=True") - - if self.dynamic_load_weight and self.load_strategy is None: - raise ValueError("Must specify load_strategy when dynamic_load_weight is True") - + def __init__( + self, + args, + ): + self.use_fastsafetensor = int(envs.FD_USE_FASTSAFETENSOR) == 1 + self.dynamic_load_weight: bool = False + self.load_strategy: Optional[Literal['ipc', 'ipc_no_reshard', 'ipc_snapshot', 'meta']] = None + for key, value in args.items(): + if hasattr(self, key): + setattr(self, key, value) -@dataclass class LoRAConfig: """ LoRA Config """ pass -@dataclass class KVCacheConfig: """ KV Cache Config """ cache_quant_dtype: str = "none" -@dataclass class DecodingConfig: """ Configuration for decoding """ - pad_token_id = None - + def __init__( + self, + args, + ): + self.pad_token_id = None + for key, value in args.items(): + if hasattr(self, key): + setattr(self, key, value) @dataclass class FDConfig: @@ -411,7 +417,6 @@ class FDConfig: load_config: LoadConfig = field(default=None, init=True) quant_config: Optional[QuantConfigBase] = None graph_opt_config: Optional[GraphOptimizationConfig] = None - moe_config: MoEConfig = field(default=None, init=True) # type: ignore decoding_config: DecodingConfig = field(default=None, init=True) # type: ignore kv_cache_config: KVCacheConfig = field(default=None, diff --git a/fastdeploy/model_executor/layers/attention/append_attn_backend.py b/fastdeploy/model_executor/layers/attention/append_attn_backend.py index 7821f95acf..a8d2124ae9 100644 --- a/fastdeploy/model_executor/layers/attention/append_attn_backend.py +++ b/fastdeploy/model_executor/layers/attention/append_attn_backend.py @@ -95,7 +95,7 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int, self.kv_num_heads: int = kv_num_heads self.num_heads: int = num_heads self.head_dim: int = fd_config.model_config.head_dim - self.num_layers: int = fd_config.model_config.num_layers + self.num_layers: int = fd_config.model_config.num_hidden_layers self.max_partition_size: int = int( os.getenv("FLAGS_max_partition_size", 32768)) diff --git a/fastdeploy/model_executor/layers/attention/attention.py b/fastdeploy/model_executor/layers/attention/attention.py index 9597ca58f7..457e5d5215 100644 --- a/fastdeploy/model_executor/layers/attention/attention.py +++ b/fastdeploy/model_executor/layers/attention/attention.py @@ -67,10 +67,10 @@ def __init__( ValueError: If the `v_head_dim` is less than 0. 
""" super().__init__() - self.num_heads: int = fd_config.model_config.num_attention_heads // fd_config.parallel_config.tensor_parallel_degree + self.num_heads: int = fd_config.model_config.num_attention_heads // fd_config.parallel_config.tensor_parallel_size self.head_dim: int = fd_config.model_config.head_dim self.kv_num_heads: int = \ - max(1, fd_config.model_config.num_key_value_heads // fd_config.parallel_config.tensor_parallel_degree) + max(1, fd_config.model_config.num_key_value_heads // fd_config.parallel_config.tensor_parallel_size) self.layer_id: int = layer_id self.v_head_dim: int = v_head_dim if v_head_dim > 0 else self.head_dim self.rope_type: str = rope_type diff --git a/fastdeploy/model_executor/layers/attention/flash_attn_backend.py b/fastdeploy/model_executor/layers/attention/flash_attn_backend.py index d5ff65b8f7..d78b444d21 100644 --- a/fastdeploy/model_executor/layers/attention/flash_attn_backend.py +++ b/fastdeploy/model_executor/layers/attention/flash_attn_backend.py @@ -96,7 +96,7 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int, self.head_dim = fd_config.model_config.head_dim self.hidden_size = fd_config.model_config.hidden_size self.block_size = fd_config.parallel_config.block_size - self.num_layers: int = fd_config.model_config.num_layers + self.num_layers: int = fd_config.model_config.num_hidden_layers self.speculative_method = fd_config.speculative_config.method self.use_speculate = self.speculative_method is not None diff --git a/fastdeploy/model_executor/layers/attention/iluvatar_attn_backend.py b/fastdeploy/model_executor/layers/attention/iluvatar_attn_backend.py index 3393423136..08795a881b 100644 --- a/fastdeploy/model_executor/layers/attention/iluvatar_attn_backend.py +++ b/fastdeploy/model_executor/layers/attention/iluvatar_attn_backend.py @@ -102,7 +102,7 @@ def __init__(self, llm_config: FDConfig, kv_num_heads: int, num_heads: int, self.head_dim = head_dim # note: scale need to change if using MLA self.attention_metadata.scale = 1.0 / sqrt(head_dim) - self.num_layers = llm_config.model_config.num_layers + self.num_layers = llm_config.model_config.num_hidden_layers self.record_block_table_metadata = {} self.only_use_flash_attn = int( os.getenv("FD_ILUVATAR_ONLY_USE_FLASH_ATTN", 0)) == 1 diff --git a/fastdeploy/model_executor/layers/attention/mla_attention_backend.py b/fastdeploy/model_executor/layers/attention/mla_attention_backend.py index b88d98756e..a29d5fe68f 100644 --- a/fastdeploy/model_executor/layers/attention/mla_attention_backend.py +++ b/fastdeploy/model_executor/layers/attention/mla_attention_backend.py @@ -113,18 +113,18 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int, self.kv_num_heads: int = kv_num_heads self.num_heads: int = num_heads self.head_dim: int = fd_config.model_config.head_dim - self.num_layers: int = fd_config.model_config.num_layers + self.num_layers: int = fd_config.model_config.num_hidden_layers # For Multi Head Latent Attention - self.kv_lora_rank: int = fd_config.model_config.deepseekv3.kv_lora_rank - self.qk_rope_head_dim: int = fd_config.model_config.deepseekv3.qk_rope_head_dim - self.qk_head_dim: int = fd_config.model_config.deepseekv3.qk_nope_head_dim \ - + fd_config.model_config.deepseekv3.qk_rope_head_dim + self.kv_lora_rank: int = fd_config.model_config.kv_lora_rank + self.qk_rope_head_dim: int = fd_config.model_config.qk_rope_head_dim + self.qk_head_dim: int = fd_config.model_config.qk_nope_head_dim \ + + fd_config.model_config.qk_rope_head_dim 
self.attn_softmax_scale: float = self.qk_head_dim**-0.5 - if fd_config.model_config.deepseekv3.rope_scaling: - mscale_all_dim = fd_config.model_config.deepseekv3.rope_scaling.get( + if fd_config.model_config.rope_scaling: + mscale_all_dim = fd_config.model_config.rope_scaling.get( "mscale_all_dim", False) # 1.0 - scaling_factor = fd_config.model_config.deepseekv3.rope_scaling[ + scaling_factor = fd_config.model_config.rope_scaling[ "factor"] # 40 mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim)) self.attn_softmax_scale = self.attn_softmax_scale * mscale * mscale diff --git a/fastdeploy/model_executor/layers/attention/utils.py b/fastdeploy/model_executor/layers/attention/utils.py index 1ba93e3bbd..ab0923630c 100644 --- a/fastdeploy/model_executor/layers/attention/utils.py +++ b/fastdeploy/model_executor/layers/attention/utils.py @@ -22,7 +22,7 @@ def init_rank_and_device_id(fd_config: FDConfig): """ rank = (fd_config.parallel_config.expert_parallel_rank * - fd_config.parallel_config.tensor_parallel_degree + fd_config.parallel_config.tensor_parallel_rank) + fd_config.parallel_config.tensor_parallel_size + fd_config.parallel_config.tensor_parallel_rank) cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES", None) diff --git a/fastdeploy/model_executor/layers/attention/xpu_attn_backend.py b/fastdeploy/model_executor/layers/attention/xpu_attn_backend.py index c95bfd6715..6c3cade149 100644 --- a/fastdeploy/model_executor/layers/attention/xpu_attn_backend.py +++ b/fastdeploy/model_executor/layers/attention/xpu_attn_backend.py @@ -95,7 +95,7 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int, self.kv_num_heads: int = kv_num_heads self.num_heads: int = num_heads self.head_dim: int = head_dim - self.num_layers: int = fd_config.model_config.num_layers + self.num_layers: int = fd_config.model_config.num_hidden_layers # pd_disaggregation self.use_pd_disaggregation: int = int( diff --git a/fastdeploy/model_executor/layers/backends/gcu/attention/flash_attn_backend.py b/fastdeploy/model_executor/layers/backends/gcu/attention/flash_attn_backend.py index b1a883c41d..00032e26fd 100644 --- a/fastdeploy/model_executor/layers/backends/gcu/attention/flash_attn_backend.py +++ b/fastdeploy/model_executor/layers/backends/gcu/attention/flash_attn_backend.py @@ -88,7 +88,7 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int, self.num_heads = num_heads self.head_dim = head_dim self.scaling = 1.0 / (self.head_dim**0.5) - self.num_layers = fd_config.model_config.num_layers + self.num_layers = fd_config.model_config.num_hidden_layers self.position_ids_base = paddle.arange(self.max_seq_len) # TODO(zhengjun): Need to adapt the allocation logic and diff --git a/fastdeploy/model_executor/layers/backends/gcu/attention/mem_efficient_attn_backend.py b/fastdeploy/model_executor/layers/backends/gcu/attention/mem_efficient_attn_backend.py index 29174ddda0..74a726a4f1 100644 --- a/fastdeploy/model_executor/layers/backends/gcu/attention/mem_efficient_attn_backend.py +++ b/fastdeploy/model_executor/layers/backends/gcu/attention/mem_efficient_attn_backend.py @@ -88,7 +88,7 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int, self.num_heads = num_heads self.head_dim = head_dim self.scaling = 1.0 / (self.head_dim**0.5) - self.num_layers = fd_config.model_config.num_layers + self.num_layers = fd_config.model_config.num_hidden_layers self.position_ids_base = paddle.arange(self.max_seq_len) # TODO(zhengjun): Need to adapt the allocation logic and 
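Note: the attention-backend hunks above all apply the same two renames rather than independent changes: `fd_config.model_config.num_layers` becomes `num_hidden_layers` (the key carried by the pretrained config) and `fd_config.parallel_config.tensor_parallel_degree` becomes `tensor_parallel_size`. Below is a minimal sketch of the resulting access pattern, using a `SimpleNamespace` stand-in for `FDConfig`; the stub values and the `ToyAttnBackend` name are illustrative only and not part of the patch.

from types import SimpleNamespace

# Illustrative stand-in for FDConfig; the real backends receive an FDConfig instance.
fd_config = SimpleNamespace(
    model_config=SimpleNamespace(num_hidden_layers=48, num_attention_heads=32,
                                 num_key_value_heads=4, head_dim=128),
    parallel_config=SimpleNamespace(tensor_parallel_size=4, tensor_parallel_rank=1,
                                    expert_parallel_rank=0),
)


class ToyAttnBackend:
    """Sketch of the post-rename field accesses shared by the attention backends."""

    def __init__(self, fd_config):
        # Layer count now comes from the pretrained-config name `num_hidden_layers`.
        self.num_layers = fd_config.model_config.num_hidden_layers
        # Per-rank head counts divide by `tensor_parallel_size` (formerly `tensor_parallel_degree`).
        tp_size = fd_config.parallel_config.tensor_parallel_size
        self.num_heads = fd_config.model_config.num_attention_heads // tp_size
        self.kv_num_heads = max(1, fd_config.model_config.num_key_value_heads // tp_size)
        # Same global rank layout as init_rank_and_device_id() in attention/utils.py.
        self.rank = (fd_config.parallel_config.expert_parallel_rank * tp_size
                     + fd_config.parallel_config.tensor_parallel_rank)


backend = ToyAttnBackend(fd_config)
print(backend.num_layers, backend.num_heads, backend.kv_num_heads, backend.rank)
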
diff --git a/fastdeploy/model_executor/layers/embeddings.py b/fastdeploy/model_executor/layers/embeddings.py index bc67cb1333..a0fb4fcc42 100644 --- a/fastdeploy/model_executor/layers/embeddings.py +++ b/fastdeploy/model_executor/layers/embeddings.py @@ -59,13 +59,11 @@ def __init__( self.world_size: int = hcg.get_model_parallel_world_size() self.ring_id: int = hcg.get_model_parallel_group().id self.use_rope: bool = fd_config.model_config.use_rope - self.rope_head_dim: int = fd_config.model_config.rope_head_dim self.use_ep: bool = fd_config.parallel_config.use_ep self.hidden_dropout_prob: float = fd_config.model_config.hidden_dropout_prob self.initializer_range: float = fd_config.model_config.initializer_range self.sequence_parallel: bool = fd_config.parallel_config.sequence_parallel self.max_position_embeddings: int = fd_config.model_config.max_position_embeddings - self.freeze_embedding: bool = fd_config.model_config.freeze_embedding self.tie_word_embeddings: bool = fd_config.model_config.tie_word_embeddings self.params_dtype: str = params_dtype @@ -104,15 +102,7 @@ def __init__( ) self.prefix = prefix - - if self.freeze_embedding: - self.word_embeddings.weight.learning_rate = 0.0 - if not self.use_rope: - self.position_embeddings.weight.learning_rate = 0.0 - self.dropout = nn.Dropout(self.hidden_dropout_prob) - self.rope_head_dim_shape_tensor = paddle.ones((self.rope_head_dim), - dtype="int8") def load_state_dict(self, state_dict: Dict[str, paddle.Tensor | np.ndarray]): @@ -122,6 +112,7 @@ def load_state_dict(self, state_dict: Dict[str, Args: state_dict (dict): A dictionary containing the checkpoint weights and biases. """ + a = state_dict[self.prefix + ".weight"] if self.tie_word_embeddings: self.word_embeddings.weight.set_value( get_tensor(state_dict[self.prefix + ".weight"]).astype( diff --git a/fastdeploy/model_executor/layers/linear.py b/fastdeploy/model_executor/layers/linear.py index ee0c930e01..813489d577 100644 --- a/fastdeploy/model_executor/layers/linear.py +++ b/fastdeploy/model_executor/layers/linear.py @@ -266,7 +266,7 @@ def __init__( with_bias=with_bias, add_bias=add_bias, skip_quant=skip_quant) - self.nranks = fd_config.parallel_config.tensor_parallel_degree + self.nranks = fd_config.parallel_config.tensor_parallel_size self.input_size = input_size self.output_size = divide( output_size, @@ -348,7 +348,7 @@ def __init__( """ self.activation = activation self.hidden_size = fd_config.model_config.hidden_size - self.nranks = fd_config.parallel_config.tensor_parallel_degree + self.nranks = fd_config.parallel_config.tensor_parallel_size super().__init__(fd_config=fd_config, prefix=prefix, @@ -410,7 +410,7 @@ def __init__(self, fd_config, prefix, with_bias=False, add_bias=True): self.kv_num_heads = fd_config.model_config.num_key_value_heads self.hidden_size = fd_config.model_config.hidden_size self.head_dim = fd_config.model_config.head_dim - self.nranks = fd_config.parallel_config.tensor_parallel_degree + self.nranks = fd_config.parallel_config.tensor_parallel_size self.num_heads_per_rank = divide(self.num_heads, self.nranks) if self.kv_num_heads < self.nranks and self.nranks % self.kv_num_heads == 0: self.kv_num_heads_per_rank = 1 @@ -545,7 +545,7 @@ def __init__( skip_quant=skip_quant) self.fd_config = fd_config self.skip_quant = False - self.nranks = fd_config.parallel_config.tensor_parallel_degree + self.nranks = fd_config.parallel_config.tensor_parallel_size self.hidden_size = fd_config.model_config.hidden_size self.head_dim = fd_config.model_config.head_dim 
self.num_heads = fd_config.model_config.num_attention_heads // self.nranks @@ -638,7 +638,7 @@ def __init__( with_bias (bool): Whether to include bias or not. Defaults to False. skip_quant (bool): Whether to skip quantization. Defaults to False. """ - self.nranks = fd_config.parallel_config.tensor_parallel_degree + self.nranks = fd_config.parallel_config.tensor_parallel_size self.kv_lora_rank = kv_lora_rank self.num_attention_heads = num_attention_heads self.qk_nope_head_dim = qk_nope_head_dim diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py b/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py index 3da7b783e4..d06b14e1bd 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py @@ -49,7 +49,7 @@ def init_ep(self, layer: nn.Layer) -> None: from .ep import EPDecoderRunner self.ep_decoder_runner = EPDecoderRunner( layer.top_k, layer.hidden_size, layer.num_experts, - layer.moe_config.num_max_dispatch_tokens_per_rank, + layer.model_config.num_max_dispatch_tokens_per_rank, layer.ep_size, layer.ep_rank) else: from .ep import EPPrefillRunner diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py index dc01f17144..14301aa445 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py @@ -14,7 +14,6 @@ # limitations under the License. """ -import numpy as np import paddle from paddle import nn from paddleformers.utils.log import logger @@ -23,8 +22,8 @@ import fastdeploy.model_executor.ops.gpu.deep_gemm as deep_gemm from fastdeploy.distributed.communication_op import \ tensor_model_parallel_all_reduce -from fastdeploy.model_executor.ops.gpu import count_tokens_per_expert_func from fastdeploy.model_executor.layers.utils import get_tensor +from fastdeploy.model_executor.ops.gpu import count_tokens_per_expert_func from ..utils import create_and_set_parameter from .fused_moe_backend_base import MoEMethodBase @@ -242,7 +241,7 @@ def apply_ep_decode( [ layer.num_local_experts, layer.ep_size * - layer.moe_config.num_max_dispatch_tokens_per_rank, + layer.model_config.num_max_dispatch_tokens_per_rank, layer.moe_intermediate_size * 2, ], dtype=paddle.bfloat16, @@ -252,7 +251,7 @@ def apply_ep_decode( [ layer.num_local_experts, layer.ep_size * - layer.moe_config.num_max_dispatch_tokens_per_rank, + layer.model_config.num_max_dispatch_tokens_per_rank, layer.hidden_size, ], dtype=paddle.bfloat16, diff --git a/fastdeploy/model_executor/layers/moe/moe.py b/fastdeploy/model_executor/layers/moe/moe.py index ae3ab4f8c1..2a6a8b4a17 100644 --- a/fastdeploy/model_executor/layers/moe/moe.py +++ b/fastdeploy/model_executor/layers/moe/moe.py @@ -72,8 +72,8 @@ def __init__( self.layer_idx = layer_idx self.reduce_results = reduce_results - self.tp_size = fd_config.parallel_config.tensor_parallel_degree - self.ep_size = fd_config.parallel_config.expert_parallel_degree + self.tp_size = fd_config.parallel_config.tensor_parallel_size + self.ep_size = fd_config.parallel_config.expert_parallel_size self.ep_rank = fd_config.parallel_config.expert_parallel_rank assert (self.tp_size >= 1 and self.ep_size == 1) or \ @@ -81,7 +81,6 @@ def __init__( 'MoE only support parallelism on TP or EP dimension.' 
self.hidden_size = fd_config.model_config.hidden_size - self.moe_config = fd_config.moe_config self.num_experts = num_experts self.num_local_experts = self.num_experts // self.ep_size @@ -141,7 +140,7 @@ def init_moe_weights(self): shape=gate_weight_shape, dtype="float32", ) - if self.moe_config.moe_use_aux_free: + if self.model_config.moe_use_aux_free: self.gate_correction_bias = self.create_parameter( shape=gate_correction_bias_shape, dtype="float32", diff --git a/fastdeploy/model_executor/load_weight_utils.py b/fastdeploy/model_executor/load_weight_utils.py index 0129052492..c9fcbd086d 100644 --- a/fastdeploy/model_executor/load_weight_utils.py +++ b/fastdeploy/model_executor/load_weight_utils.py @@ -43,7 +43,7 @@ def load_ep_checkpoint(model_path: str, filtered_map = {k: v for k, v in weight_list.items() if "experts" not in k} num_local_ffn_keys = [] - for i in range(config.moe_layer_start_index, config.num_layers): + for i in range(config.moe_layer_start_index, config.num_hidden_layers): for j in range( config.num_experts_start_offset, config.num_experts_start_offset + config.num_experts_per_rank, @@ -261,7 +261,7 @@ def load_composite_checkpoint( and os.path.isdir(os.path.join(model_path, f)) ] if len(rank_dirs) > 1: - if fd_config.parallel_config.tensor_parallel_degree != len( + if fd_config.parallel_config.tensor_parallel_size != len( rank_dirs): raise ValueError( f"Your model only supports loading with tp{len(rank_dirs)}" @@ -283,7 +283,7 @@ def load_composite_checkpoint( else: state_dict = load_tp_checkpoint(model_path, cls, - fd_config.model_config, + fd_config.model_config.pretrained_config, return_numpy=return_numpy) if not state_dict: raise ValueError("weight not found in state_dict !") diff --git a/fastdeploy/model_executor/models/deepseek_v3.py b/fastdeploy/model_executor/models/deepseek_v3.py index dc1d56f9cd..a9ac1d50b4 100644 --- a/fastdeploy/model_executor/models/deepseek_v3.py +++ b/fastdeploy/model_executor/models/deepseek_v3.py @@ -27,6 +27,7 @@ from fastdeploy.config import FDConfig from fastdeploy.distributed.communication_op import \ tensor_model_parallel_all_reduce +from fastdeploy.model_executor.forward_meta import ForwardMeta from fastdeploy.model_executor.layers.activation import SiluAndMul from fastdeploy.model_executor.layers.attention.attention import Attention from fastdeploy.model_executor.layers.embeddings import VocabParallelEmbedding @@ -40,7 +41,6 @@ DeepseekScalingRotaryEmbedding from fastdeploy.model_executor.models.model_base import ModelForCasualLM from fastdeploy.platforms import current_platform -from fastdeploy.model_executor.forward_meta import ForwardMeta if current_platform.is_cuda(): from fastdeploy.model_executor.ops.gpu import \ @@ -109,7 +109,7 @@ def __init__(self, fd_config: FDConfig, layer_id: int, prefix: str) -> None: super().__init__() - self.tp_size = fd_config.parallel_config.tensor_parallel_degree + self.tp_size = fd_config.parallel_config.tensor_parallel_size weight_key_map = { "gate_weight_key": f"{prefix}.gate.weight", @@ -124,23 +124,23 @@ def __init__(self, fd_config: FDConfig, layer_id: int, self.fused_moe = FusedMoE( fd_config=fd_config, reduce_results=False, - moe_intermediate_size=fd_config.model_config.deepseekv3. + moe_intermediate_size=fd_config.model_config. 
moe_intermediate_size, - num_experts=fd_config.model_config.deepseekv3.n_routed_experts, - top_k=fd_config.model_config.deepseekv3.num_experts_per_tok, - topk_method=fd_config.model_config.deepseekv3.topk_method, - topk_group=fd_config.model_config.deepseekv3.topk_group, - n_group=fd_config.model_config.deepseekv3.n_group, - routed_scaling_factor=fd_config.model_config.deepseekv3. + num_experts=fd_config.model_config.n_routed_experts, + top_k=fd_config.model_config.num_experts_per_tok, + topk_method=fd_config.model_config.topk_method, + topk_group=fd_config.model_config.topk_group, + n_group=fd_config.model_config.n_group, + routed_scaling_factor=fd_config.model_config. routed_scaling_factor, layer_idx=layer_id, weight_key_map=weight_key_map, ) - self.num_shared_experts = fd_config.model_config.deepseekv3.n_shared_experts + self.num_shared_experts = fd_config.model_config.n_shared_experts shared_experts_intermediate_size = ( self.num_shared_experts * - fd_config.model_config.deepseekv3.moe_intermediate_size) + fd_config.model_config.moe_intermediate_size) self.shared_experts = DeepSeekV3MLP( fd_config=fd_config, @@ -178,18 +178,18 @@ def __init__(self, prefix: str = "") -> None: super().__init__() - self.tp_size = fd_config.parallel_config.tensor_parallel_degree + self.tp_size = fd_config.parallel_config.tensor_parallel_size self.hidden_size = fd_config.model_config.hidden_size self.num_attention_heads = fd_config.model_config.num_attention_heads self.num_attention_heads_tp = self.num_attention_heads // self.tp_size # MLA - self.qk_nope_head_dim = fd_config.model_config.deepseekv3.qk_nope_head_dim - self.qk_rope_head_dim = fd_config.model_config.deepseekv3.qk_rope_head_dim + self.qk_nope_head_dim = fd_config.model_config.qk_nope_head_dim + self.qk_rope_head_dim = fd_config.model_config.qk_rope_head_dim self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim - self.v_head_dim = fd_config.model_config.deepseekv3.v_head_dim - self.q_lora_rank = fd_config.model_config.deepseekv3.q_lora_rank - self.kv_lora_rank = fd_config.model_config.deepseekv3.kv_lora_rank + self.v_head_dim = fd_config.model_config.v_head_dim + self.q_lora_rank = fd_config.model_config.q_lora_rank + self.kv_lora_rank = fd_config.model_config.kv_lora_rank self.attn_softmax_scale = self.qk_head_dim**-0.5 self.rope_theta = fd_config.model_config.rope_theta @@ -255,7 +255,7 @@ def __init__(self, qk_nope_head_dim=self.qk_nope_head_dim, v_head_dim=self.v_head_dim) - self.rope_scaling = fd_config.model_config.deepseekv3.rope_scaling + self.rope_scaling = fd_config.model_config.rope_scaling if self.rope_scaling: mscale_all_dim = self.rope_scaling.get("mscale_all_dim", False) scaling_factor = self.rope_scaling["factor"] @@ -449,9 +449,9 @@ def __init__( prefix=f"{prefix}.self_attn", ) - if (fd_config.model_config.deepseekv3.n_routed_experts is not None + if (fd_config.model_config.n_routed_experts is not None and layer_id - >= fd_config.model_config.deepseekv3.first_k_dense_replace): + >= fd_config.model_config.first_k_dense_replace): self.mlp = DeepSeekV3MoE( fd_config=fd_config, layer_id=layer_id, @@ -525,8 +525,8 @@ def __init__( Initializer for the DeepSeekV3Model class. 
""" super().__init__() - self.num_layers = fd_config.model_config.num_layers - fd_config.model_config.prefix_name = "deepseek_v3" + self.num_layers = fd_config.model_config.num_hidden_layers + fd_config.model_config.pretrained_config.prefix_name = "deepseek_v3" self.embeddings = VocabParallelEmbedding( fd_config, @@ -539,7 +539,7 @@ def __init__( self.decoder_layers = nn.LayerList([ DeepSeekV3DecoderLayer( fd_config, - prefix=f"{fd_config.model_config.prefix_name}.layers.{i}") + prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.layers.{i}") for i in range(self.num_layers) ]) @@ -755,5 +755,5 @@ def get_tensor_parallel_split_mappings(num_layers): return final_actions - mappings = get_tensor_parallel_split_mappings(config.num_layers) + mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers) return mappings diff --git a/fastdeploy/model_executor/models/ernie4_5_moe.py b/fastdeploy/model_executor/models/ernie4_5_moe.py index d1fef50d08..4ae0b3c18d 100644 --- a/fastdeploy/model_executor/models/ernie4_5_moe.py +++ b/fastdeploy/model_executor/models/ernie4_5_moe.py @@ -25,7 +25,7 @@ from paddleformers.transformers import PretrainedModel from paddleformers.utils.log import logger -from fastdeploy.config import FDConfig, ModelConfig +from fastdeploy.config import FDConfig from fastdeploy.model_executor.forward_meta import ForwardMeta from fastdeploy.model_executor.graph_optimization.decorator import \ support_graph_optimization @@ -54,7 +54,7 @@ def __init__( reduce_results: bool = True, ) -> None: super().__init__() - self.nranks = fd_config.parallel_config.tensor_parallel_degree + self.nranks = fd_config.parallel_config.tensor_parallel_size self.gate_up_proj = MergedColumnParallelLinear( fd_config=fd_config, prefix=f"{prefix}.up_gate_proj", @@ -179,16 +179,16 @@ def __init__(self, fd_config: FDConfig, layer_id: int, self.fused_moe = FusedMoE( fd_config=fd_config, - moe_intermediate_size=fd_config.moe_config.moe_intermediate_size, - num_experts=fd_config.moe_config.num_experts, - top_k=fd_config.moe_config.top_k, + moe_intermediate_size=fd_config.model_config.moe_intermediate_size, + num_experts=fd_config.model_config.moe_num_experts, + top_k=fd_config.model_config.moe_k, layer_idx=layer_id, weight_key_map=weight_key_map, ) - self.num_shared_experts = fd_config.moe_config.moe_num_shared_experts + self.num_shared_experts = fd_config.model_config.moe_num_shared_experts if self.num_shared_experts > 0: - shared_experts_hidden_dim = self.num_shared_experts * fd_config.moe_config.moe_intermediate_size + shared_experts_hidden_dim = self.num_shared_experts * fd_config.model_config.moe_intermediate_size self.shared_experts = Ernie4_5_MLP( fd_config=fd_config, intermediate_size=shared_experts_hidden_dim, @@ -271,8 +271,8 @@ def __init__( prefix=f"{prefix}.self_attn", ) - if (fd_config.moe_config.num_experts is not None - and layer_id >= fd_config.moe_config.moe_layer_start_index): + if (fd_config.model_config.moe_num_experts is not None + and layer_id >= fd_config.model_config.moe_layer_start_index): self.mlp = Ernie4_5_MoE( fd_config=fd_config, layer_id=layer_id, @@ -281,7 +281,7 @@ def __init__( else: self.mlp = Ernie4_5_MLP( fd_config=fd_config, - intermediate_size=fd_config.model_config.ffn_hidden_size, + intermediate_size=fd_config.model_config.intermediate_size, prefix=f"{prefix}.mlp", ) @@ -346,20 +346,20 @@ def __init__( """ super().__init__() - self.num_layers = fd_config.model_config.num_layers - fd_config.model_config.prefix_name = "ernie" + self.num_layers = 
fd_config.model_config.num_hidden_layers + fd_config.model_config.pretrained_config.prefix_name = "ernie" self.embeddings = VocabParallelEmbedding( fd_config=fd_config, num_embeddings=fd_config.model_config.vocab_size, embedding_dim=fd_config.model_config.hidden_size, params_dtype=paddle.get_default_dtype(), - prefix=(f"{fd_config.model_config.prefix_name}.embed_tokens")) + prefix=(f"{fd_config.model_config.pretrained_config.prefix_name}.embed_tokens")) self.hidden_layers = nn.LayerList([ Ernie4_5_DecoderLayer( fd_config=fd_config, - prefix=f"{fd_config.model_config.prefix_name}.layers.{i}") + prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.layers.{i}") for i in range(self.num_layers) ]) @@ -367,7 +367,7 @@ def __init__( fd_config, hidden_size=fd_config.model_config.hidden_size, eps=fd_config.model_config.rms_norm_eps, - prefix=f"{fd_config.model_config.prefix_name}.norm", + prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.norm", ) def load_state_dict(self, state_dict): @@ -466,8 +466,8 @@ def empty_input_forward(self): shape=[0, self.fd_config.model_config.hidden_size], dtype=paddle.get_default_dtype(), ) - for i in range(self.fd_config.moe_config.moe_layer_start_index, - self.fd_config.model_config.num_layers): + for i in range(self.fd_config.model_config.moe_layer_start_index, + self.fd_config.model_config.num_hidden_layers): self.model.hidden_layers[i].mlp.fused_moe(fake_hidden_states) def forward( @@ -559,7 +559,7 @@ def _init_weight(self, layer): ] @classmethod - def _get_tensor_parallel_mappings(cls, config: ModelConfig, is_split=True): + def _get_tensor_parallel_mappings(cls, config, is_split=True): """ get_tensor_parallel_mappings """ @@ -603,7 +603,7 @@ def get_tensor_parallel_split_mappings(num_layers, moe_num_experts, ) return final_actions mappings = get_tensor_parallel_split_mappings( - config.num_layers, + config.num_hidden_layers, config.moe_num_experts, config.moe_layer_start_index, config.prefix_name, diff --git a/fastdeploy/model_executor/models/ernie4_5_mtp.py b/fastdeploy/model_executor/models/ernie4_5_mtp.py index 6b3b6ff155..02a711c949 100644 --- a/fastdeploy/model_executor/models/ernie4_5_mtp.py +++ b/fastdeploy/model_executor/models/ernie4_5_mtp.py @@ -25,12 +25,12 @@ from paddleformers.transformers import PretrainedModel from paddleformers.utils.log import logger -from fastdeploy.config import FDConfig, ModelConfig +from fastdeploy.config import FDConfig +from fastdeploy.model_executor.forward_meta import ForwardMeta from fastdeploy.model_executor.layers.mtp_linear import ParallelEHProjection from fastdeploy.model_executor.layers.normalization import RMSNorm from fastdeploy.model_executor.models.ernie4_5_moe import Ernie4_5_DecoderLayer from fastdeploy.model_executor.models.model_base import ModelForCasualLM -from fastdeploy.model_executor.forward_meta import ForwardMeta class Ernie4_5_MTPPretrainedModel(PretrainedModel): @@ -47,7 +47,7 @@ def _init_weight(self, layer): return None @classmethod - def _get_tensor_parallel_mappings(cls, config: ModelConfig, is_split=True): + def _get_tensor_parallel_mappings(cls, config, is_split=True): """ get_tensor_parallel_mappings """ @@ -237,7 +237,7 @@ def get_tensor_parallel_split_mappings(num_layers, moe_num_experts, moe_num_experts = 0 mappings = get_tensor_parallel_split_mappings( - config.num_layers, + config.num_hidden_layers, moe_num_experts, config.moe_layer_start_index, ) @@ -262,13 +262,13 @@ def __init__( """ super().__init__() - self.num_layers = fd_config.model_config.num_layers + 
self.num_layers = fd_config.model_config.num_hidden_layers self.embeddings = fd_config.speculative_config.sharing_model.model.embeddings self.hidden_layers = nn.LayerList([ Ernie4_5_DecoderLayer( fd_config=fd_config, - prefix=f"{fd_config.model_config.prefix_name}.{i}") + prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.{i}") for i in range(self.num_layers) ]) @@ -398,8 +398,8 @@ def empty_input_forward(self): shape=[0, self.fd_config.model_config.hidden_size], dtype=paddle.get_default_dtype(), ) - for i in range(self.fd_config.moe_config.moe_layer_start_index, - self.fd_config.model_config.num_layers): + for i in range(self.fd_config.model_config.moe_layer_start_index, + self.fd_config.model_config.num_hidden_layers): self.model.hidden_layers[i].mlp.fused_moe(fake_hidden_states) def forward( diff --git a/fastdeploy/model_executor/models/ernie4_5_vl/configuration.py b/fastdeploy/model_executor/models/ernie4_5_vl/configuration.py deleted file mode 100644 index f25742d3c2..0000000000 --- a/fastdeploy/model_executor/models/ernie4_5_vl/configuration.py +++ /dev/null @@ -1,167 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -import copy - -from fastdeploy.config import ModelConfig - -from .dfnrope.modeling import DFNRopeVisionTransformerConfig - -__all__ = [ - "Ernie4_5_VLMoeConfig", -] - - -class Ernie4_5_VLMoeConfig(ModelConfig): - r""" - This is the configuration class to store the configuration of a [`~ErnieModel`]. It is used to instantiate an Ernie - model according to the specified arguments, defining the model architecture. Instantiating a configuration with the - defaults will yield a similar configuration to that of the Ernie-7B. - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - Args: - vocab_size (`int`, *optional*, defaults to 32000): - Vocabulary size of the Ernie model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`~ErnieModel`] or [`~TFErnieModel`]. - hidden_size (`int`, *optional*, defaults to 4096): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 11008): - Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 32): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 32): - Number of attention heads for each attention layer in the Transformer encoder. - hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) in the decoder. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - rms_norm_eps (`float`, *optional*, defaults to 1e-12): - The epsilon used by the rms normalization layers. 
- use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - tie_word_embeddings(`bool`, *optional*, defaults to `False`): - Whether to tie weight embeddings - Example: - ```python - >>> from paddleformers.transformer import ErnieModel, ErnieConfig - - >>> # Initializing a Ernie ernie-7b style configuration - >>> configuration = ErnieConfig() - - >>> # Initializing a model from the ernie-7b style configuration - >>> model = ErnieModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - - model_type = "erniemoevl" - attribute_map = { - "n_positions": "max_position_embeddings", - "n_embd": "hidden_size", - "n_layer": "num_hidden_layers", - "n_head": "num_attention_heads", - "n_inner": "intermediate_size", - "activation_function": "hidden_act", - } - - def __init__( - self, - vision_config=None, - im_patch_id=None, - pixel_hidden_size=None, # None for fuyu - modality_detach=False, - temporal_conv_size=2, - spatial_conv_size=2, - mm_vocab_size=0, # vocab for mm specialtokens - max_text_id=None, - use_temporal_conv=True, - moe_use_size_all2all=False, - moe_num_attn_experts=False, - moe_dense_experts_token_type_id: int = 3, - moe_use_hard_gate: bool = True, - moe_fuse_experts: bool = False, - moe_use_token_type_bias: bool = False, - disable_ffn_model_parallel=False, - fuse_attn_ffn=True, - rope_3d=True, - freq_allocation=20, - using_precision_check=False, - use_recompute_resampler=False, - resampler_fuse_rms_norm=False, - moe_layer_feed_fake_token=False, - moe_num_experts=0, - **kwargs, - ): - super().__init__(**kwargs) - self.vision_config = DFNRopeVisionTransformerConfig( - **vision_config) if vision_config else None - self.im_patch_id = im_patch_id - self.pixel_hidden_size = pixel_hidden_size - self.modality_detach = modality_detach - self.temporal_conv_size = temporal_conv_size - self.spatial_conv_size = spatial_conv_size - self.mm_vocab_size = mm_vocab_size - self.max_text_id = max_text_id - self.use_temporal_conv = use_temporal_conv - - self.moe_use_size_all2all = moe_use_size_all2all - self.moe_num_attn_experts = moe_num_attn_experts - self.moe_dense_experts_token_type_id = moe_dense_experts_token_type_id - self.moe_use_hard_gate = moe_use_hard_gate - self.moe_fuse_experts = moe_fuse_experts - self.moe_use_token_type_bias = moe_use_token_type_bias - self.disable_ffn_model_parallel = disable_ffn_model_parallel - - self.fuse_attn_ffn = fuse_attn_ffn - self.rope_3d = rope_3d - self.freq_allocation = freq_allocation - self.using_precision_check = using_precision_check - self.use_recompute_resampler = use_recompute_resampler - self.resampler_fuse_rms_norm = resampler_fuse_rms_norm - self.moe_layer_feed_fake_token = moe_layer_feed_fake_token - self.moe_num_experts = moe_num_experts - - @property - def multimodel_experts(self) -> bool: - """是否有多种类型的experts.""" - return isinstance(self.moe_num_experts, - (tuple, list)) and len(self.moe_num_experts) > 1 - - @property - def use_moe(self) -> bool: - """ - Check if model is using MoE architecture. 
- - Returns: - bool: True if moe_num_experts > 0, False otherwise - """ - return sum( - self.moe_num_experts - ) > 0 if self.multimodel_experts else self.moe_num_experts > 0 - - def to_dict(self, saving_file=False): - """to_dict""" - output = copy.deepcopy(self.__dict__) - if self.vision_config: - output["vision_config"] = ( - self.vision_config.to_diff_dict() if isinstance( - self.vision_config, - (DFNRopeVisionTransformerConfig)) else self.vision_config) - - output["model_type"] = self.__class__.model_type - return output diff --git a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py index b10b6b03ea..622bf28014 100644 --- a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py +++ b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py @@ -72,8 +72,8 @@ def __init__(self, fd_config: FDConfig, layer_id: int, prefix: str) -> None: super().__init__() - self.tp_size = fd_config.parallel_config.tensor_parallel_degree - moe_layer_start_index = fd_config.moe_config.moe_layer_start_index + self.tp_size = fd_config.parallel_config.tensor_parallel_size + moe_layer_start_index = fd_config.model_config.moe_layer_start_index if isinstance(moe_layer_start_index, int): text_moe_layer_start_index = moe_layer_start_index image_moe_layer_start_index = moe_layer_start_index @@ -81,10 +81,10 @@ def __init__(self, fd_config: FDConfig, layer_id: int, text_moe_layer_start_index = moe_layer_start_index[0] image_moe_layer_start_index = moe_layer_start_index[1] - moe_layer_end_index = fd_config.moe_config.moe_layer_end_index + moe_layer_end_index = fd_config.model_config.moe_layer_end_index if moe_layer_end_index is None: - text_moe_layer_end_index = fd_config.model_config.num_layers - image_moe_layer_end_index = fd_config.model_config.num_layers + text_moe_layer_end_index = fd_config.model_config.num_hidden_layers + image_moe_layer_end_index = fd_config.model_config.num_hidden_layers elif isinstance(moe_layer_end_index, int): text_moe_layer_end_index = moe_layer_end_index image_moe_layer_end_index = moe_layer_end_index @@ -107,11 +107,11 @@ def __init__(self, fd_config: FDConfig, layer_id: int, self.mlp_text = FusedMoE( fd_config=fd_config, reduce_results=False, - moe_intermediate_size=fd_config.moe_config. + moe_intermediate_size=fd_config.model_config. moe_intermediate_size[0], - num_experts=fd_config.moe_config.num_experts[0], + num_experts=fd_config.model_config.moe_num_experts[0], expert_id_offset=0, - top_k=fd_config.moe_config.top_k, + top_k=fd_config.model_config.moe_k, layer_idx=layer_id, moe_tag="Text", weight_key_map=weight_key_map, @@ -120,7 +120,7 @@ def __init__(self, fd_config: FDConfig, layer_id: int, else: self.mlp_text = Ernie4_5_VLMLP( fd_config=fd_config, - intermediate_size=fd_config.model_config.ffn_hidden_size, + intermediate_size=fd_config.model_config.intermediate_size, prefix=f"{prefix}", ) @@ -139,11 +139,11 @@ def __init__(self, fd_config: FDConfig, layer_id: int, self.mlp_image = FusedMoE( fd_config=fd_config, reduce_results=False, - moe_intermediate_size=fd_config.moe_config. + moe_intermediate_size=fd_config.model_config. 
moe_intermediate_size[1], - num_experts=fd_config.moe_config.num_experts[1], - expert_id_offset=fd_config.moe_config.num_experts[0], - top_k=fd_config.moe_config.top_k, + num_experts=fd_config.model_config.moe_num_experts[1], + expert_id_offset=fd_config.model_config.moe_num_experts[0], + top_k=fd_config.model_config.moe_k, layer_idx=layer_id, moe_tag="Image", weight_key_map=weight_key_map, @@ -152,16 +152,16 @@ def __init__(self, fd_config: FDConfig, layer_id: int, else: self.mlp_image = Ernie4_5_VLMLP( fd_config=fd_config, - intermediate_size=fd_config.model_config.ffn_hidden_size, + intermediate_size=fd_config.model_config.intermediate_size, prefix=f"{prefix}", ) - self.num_shared_experts = fd_config.moe_config.moe_num_shared_experts + self.num_shared_experts = fd_config.model_config.moe_num_shared_experts if self.num_shared_experts > 0: self.share_experts = Ernie4_5_VLMLP( fd_config=fd_config, intermediate_size=self.num_shared_experts * - fd_config.moe_config.moe_intermediate_size[0], + fd_config.model_config.moe_intermediate_size[0], prefix=f"{prefix}.shared_experts", reduce_results=False, ) @@ -235,15 +235,15 @@ def __init__( super().__init__() layer_id = int(prefix.split(sep='.')[-1]) - moe_layer_start_index = fd_config.moe_config.moe_layer_start_index + moe_layer_start_index = fd_config.model_config.moe_layer_start_index if isinstance(moe_layer_start_index, list): min_moe_layer_start_index = min(moe_layer_start_index) else: min_moe_layer_start_index = moe_layer_start_index - max_moe_layer_end_index = fd_config.model_config.num_layers - if fd_config.moe_config.moe_layer_end_index is not None: - moe_layer_end_index = fd_config.moe_config.moe_layer_end_index + max_moe_layer_end_index = fd_config.model_config.num_hidden_layers + if fd_config.model_config.moe_layer_end_index is not None: + moe_layer_end_index = fd_config.model_config.moe_layer_end_index if isinstance(moe_layer_start_index, list): max_moe_layer_end_index = max(moe_layer_end_index) else: @@ -257,7 +257,7 @@ def __init__( assert min_moe_layer_start_index <= max_moe_layer_end_index - if (fd_config.moe_config.num_experts is not None + if (fd_config.model_config.moe_num_experts is not None and layer_id >= min_moe_layer_start_index and layer_id <= max_moe_layer_end_index): self.mlp = Ernie4_5_VLMoE( @@ -268,7 +268,7 @@ def __init__( else: self.mlp = Ernie4_5_VLMLP( fd_config=fd_config, - intermediate_size=fd_config.model_config.ffn_hidden_size, + intermediate_size=fd_config.model_config.intermediate_size, prefix=f"{prefix}.mlp", ) @@ -337,23 +337,23 @@ def __init__( """ super().__init__() - self.num_layers = fd_config.model_config.num_layers - self.im_patch_id = fd_config.moe_config.im_patch_id + self.num_layers = fd_config.model_config.num_hidden_layers + self.im_patch_id = fd_config.model_config.im_patch_id self._dtype = fd_config.model_config.dtype - fd_config.model_config.prefix_name = "ernie" + fd_config.model_config.pretrained_config.prefix_name = "ernie" self.embeddings = VocabParallelEmbedding( fd_config=fd_config, num_embeddings=fd_config.model_config.vocab_size, embedding_dim=fd_config.model_config.hidden_size, params_dtype=paddle.get_default_dtype, - prefix=(f"{fd_config.model_config.prefix_name}.embed_tokens"), + prefix=(f"{fd_config.model_config.pretrained_config.prefix_name}.embed_tokens"), ) self.hidden_layers = nn.LayerList([ Ernie4_5_VLDecoderLayer( fd_config=fd_config, - prefix=f"{fd_config.model_config.prefix_name}.layers.{i}") + prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.layers.{i}") 
for i in range(self.num_layers) ]) @@ -361,7 +361,7 @@ def __init__( fd_config, hidden_size=fd_config.model_config.hidden_size, eps=fd_config.model_config.rms_norm_eps, - prefix=f"{fd_config.model_config.prefix_name}.norm", + prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.norm", ) def load_state_dict(self, state_dict): @@ -748,7 +748,7 @@ def get_vison_parallel_split_mappings(num_layers: int): moe_layer_start_index = config.moe_layer_start_index mappings = get_tensor_parallel_split_mappings( - config.num_layers, + config.num_hidden_layers, config.moe_num_experts, moe_layer_start_index, config.prefix_name, diff --git a/fastdeploy/model_executor/models/model_base.py b/fastdeploy/model_executor/models/model_base.py index fbc05899be..4150adb98b 100644 --- a/fastdeploy/model_executor/models/model_base.py +++ b/fastdeploy/model_executor/models/model_base.py @@ -53,7 +53,7 @@ def __init__(self, configs): """ Args: configs (dict): Configurations including parameters such as max_dec_len, min_dec_len, decode_strategy, - ori_vocab_size, use_topp_sampling, etc. + vocab_size, use_topp_sampling, etc. """ super(ModelForCasualLM, self).__init__() self.fd_config = configs diff --git a/fastdeploy/model_executor/models/qwen2.py b/fastdeploy/model_executor/models/qwen2.py index 6695f38549..81e0041079 100644 --- a/fastdeploy/model_executor/models/qwen2.py +++ b/fastdeploy/model_executor/models/qwen2.py @@ -24,6 +24,7 @@ from paddleformers.utils.log import logger from fastdeploy.config import FDConfig, ModelConfig +from fastdeploy.model_executor.forward_meta import ForwardMeta from fastdeploy.model_executor.graph_optimization.decorator import \ support_graph_optimization from fastdeploy.model_executor.layers.activation import SiluAndMul @@ -34,7 +35,6 @@ from fastdeploy.model_executor.layers.lm_head import ParallelLMHead from fastdeploy.model_executor.layers.normalization import RMSNorm from fastdeploy.model_executor.models.model_base import ModelForCasualLM -from fastdeploy.model_executor.forward_meta import ForwardMeta class Qwen2MLP(nn.Layer): @@ -47,12 +47,12 @@ def __init__( prefix: str = "", ) -> None: super().__init__() - self.nranks = fd_config.parallel_config.tensor_parallel_degree + self.nranks = fd_config.parallel_config.tensor_parallel_size self.gate_up_proj = MergedColumnParallelLinear( fd_config=fd_config, prefix=f"{prefix}.up_gate_proj", input_size=fd_config.model_config.hidden_size, - output_size=fd_config.model_config.ffn_hidden_size * 2, + output_size=fd_config.model_config.intermediate_size * 2, with_bias=False, activation=fd_config.model_config.hidden_act, ) @@ -60,7 +60,7 @@ def __init__( self.down_proj = RowParallelLinear( fd_config=fd_config, prefix=f"{prefix}.down_proj", - input_size=fd_config.model_config.ffn_hidden_size, + input_size=fd_config.model_config.intermediate_size, output_size=fd_config.model_config.hidden_size, with_bias=False, ) @@ -227,21 +227,21 @@ def __init__( """ super().__init__() - self.num_layers = fd_config.model_config.num_layers - fd_config.model_config.prefix_name = "qwen2" + self.num_layers = fd_config.model_config.num_hidden_layers + fd_config.model_config.pretrained_config.prefix_name = "qwen2" self.embeddings = VocabParallelEmbedding( fd_config=fd_config, num_embeddings=fd_config.model_config.vocab_size, embedding_dim=fd_config.model_config.hidden_size, params_dtype=paddle.get_default_dtype, - prefix=(f"{fd_config.model_config.prefix_name}.embed_tokens"), + prefix=(f"{fd_config.model_config.pretrained_config.prefix_name}.embed_tokens"), ) 
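Editor's note: the hunks in this patch apply the same attribute renames over and over (num_layers -> num_hidden_layers, ffn_hidden_size -> intermediate_size, tensor_parallel_degree -> tensor_parallel_size, and the former moe_config fields folded into model_config). The sketch below is illustrative only and not part of the patch; the mapping dict and resolve_attr are hypothetical helpers that merely collect the renames visible in these hunks, e.g. for downstream code that still reads the old names.

# Editor's sketch (not part of this patch): legacy -> new config attribute names
# as applied throughout this change. `resolve_attr` is a hypothetical helper;
# dotted targets are resolved attribute by attribute.
_LEGACY_TO_NEW = {
    # ModelConfig
    "num_layers": "num_hidden_layers",
    "ffn_hidden_size": "intermediate_size",
    "max_length": "max_model_len",
    "prefix_name": "pretrained_config.prefix_name",
    # ParallelConfig
    "tensor_parallel_degree": "tensor_parallel_size",
    "expert_parallel_degree": "expert_parallel_size",
    # former MoEConfig fields, now read from ModelConfig
    "num_experts": "moe_num_experts",
    "top_k": "moe_k",  # the Qwen3-MoE hunks use `moe_topk` instead
}

def resolve_attr(cfg, name):
    """Return cfg.<name>, falling back to its renamed form from the table above."""
    if hasattr(cfg, name):
        return getattr(cfg, name)
    obj = cfg
    for part in _LEGACY_TO_NEW[name].split("."):
        obj = getattr(obj, part)
    return obj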
self.layers = nn.LayerList([ Qwen2DecoderLayer( fd_config=fd_config, - prefix=f"{fd_config.model_config.prefix_name}.layers.{i}") + prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.layers.{i}") for i in range(self.num_layers) ]) @@ -249,7 +249,7 @@ def __init__( fd_config, hidden_size=fd_config.model_config.hidden_size, eps=fd_config.model_config.rms_norm_eps, - prefix=f"{fd_config.model_config.prefix_name}.norm", + prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.norm", ) def load_state_dict(self, state_dict): @@ -427,6 +427,6 @@ def get_tensor_parallel_split_mappings(num_layers): return final_actions - mappings = get_tensor_parallel_split_mappings(config.num_layers) + mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers) return mappings diff --git a/fastdeploy/model_executor/models/qwen3.py b/fastdeploy/model_executor/models/qwen3.py index 273688826b..5a75a868ea 100644 --- a/fastdeploy/model_executor/models/qwen3.py +++ b/fastdeploy/model_executor/models/qwen3.py @@ -23,7 +23,8 @@ from paddleformers.transformers import PretrainedModel from paddleformers.utils.log import logger -from fastdeploy.config import FDConfig, ModelConfig +from fastdeploy.config import FDConfig +from fastdeploy.model_executor.forward_meta import ForwardMeta from fastdeploy.model_executor.graph_optimization.decorator import \ support_graph_optimization from fastdeploy.model_executor.layers.attention.attention import Attention @@ -34,7 +35,6 @@ from fastdeploy.model_executor.layers.normalization import RMSNorm from fastdeploy.model_executor.models.model_base import ModelForCasualLM from fastdeploy.model_executor.models.qwen2 import Qwen2DecoderLayer, Qwen2MLP -from fastdeploy.model_executor.forward_meta import ForwardMeta class Qwen3MLP(Qwen2MLP): @@ -59,7 +59,7 @@ def __init__(self, self.qkv_proj = QKVParallelLinear(fd_config, prefix=f"{prefix}.qkv_proj", with_bias=False) - nranks = fd_config.parallel_config.tensor_parallel_degree + nranks = fd_config.parallel_config.tensor_parallel_size self.o_proj = RowParallelLinear( fd_config, @@ -85,7 +85,7 @@ def __init__(self, prefix=f"{prefix}.k_norm", begin_norm_axis=2) - nranks = fd_config.parallel_config.tensor_parallel_degree + nranks = fd_config.parallel_config.tensor_parallel_size num_kv_heads_replicas = max(1, nranks // fd_config.model_config.num_key_value_heads) self.q_size = fd_config.model_config.num_attention_heads * self.head_dim // nranks self.kv_size = fd_config.model_config.num_key_value_heads * self.head_dim * num_kv_heads_replicas // nranks @@ -163,21 +163,21 @@ def __init__( """ super().__init__() - self.num_layers = fd_config.model_config.num_layers - fd_config.model_config.prefix_name = "model" + self.num_layers = fd_config.model_config.num_hidden_layers + fd_config.model_config.pretrained_config.prefix_name = "model" self.embeddings = VocabParallelEmbedding( fd_config=fd_config, num_embeddings=fd_config.model_config.vocab_size, embedding_dim=fd_config.model_config.hidden_size, params_dtype=paddle.get_default_dtype, - prefix=(f"{fd_config.model_config.prefix_name}.embed_tokens"), + prefix=(f"{fd_config.model_config.pretrained_config.prefix_name}.embed_tokens"), ) self.layers = nn.LayerList([ Qwen3DecoderLayer( fd_config=fd_config, - prefix=f"{fd_config.model_config.prefix_name}.layers.{i}") + prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.layers.{i}") for i in range(self.num_layers) ]) @@ -185,7 +185,7 @@ def __init__( fd_config, hidden_size=fd_config.model_config.hidden_size, 
eps=fd_config.model_config.rms_norm_eps, - prefix=f"{fd_config.model_config.prefix_name}.norm", + prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.norm", ) def load_state_dict(self, state_dict): @@ -307,7 +307,7 @@ def _init_weight(self, layer): return None @classmethod - def _get_tensor_parallel_mappings(cls, config: ModelConfig, is_split=True): + def _get_tensor_parallel_mappings(cls, config, is_split=True): from paddleformers.transformers.conversion_utils import \ split_or_merge_func @@ -358,5 +358,5 @@ def get_tensor_parallel_split_mappings(num_layers): return final_actions - mappings = get_tensor_parallel_split_mappings(config.num_layers) + mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers) return mappings diff --git a/fastdeploy/model_executor/models/qwen3moe.py b/fastdeploy/model_executor/models/qwen3moe.py index 03e4d58567..b222f48abf 100644 --- a/fastdeploy/model_executor/models/qwen3moe.py +++ b/fastdeploy/model_executor/models/qwen3moe.py @@ -23,20 +23,19 @@ from paddleformers.transformers import PretrainedModel from paddleformers.utils.log import logger -from fastdeploy.config import FDConfig, ModelConfig +from fastdeploy.config import FDConfig +from fastdeploy.model_executor.forward_meta import ForwardMeta from fastdeploy.model_executor.graph_optimization.decorator import \ support_graph_optimization from fastdeploy.model_executor.layers.activation import SiluAndMul -from fastdeploy.model_executor.layers.attention.attention import Attention from fastdeploy.model_executor.layers.embeddings import VocabParallelEmbedding from fastdeploy.model_executor.layers.linear import ( - MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) + MergedColumnParallelLinear, RowParallelLinear) from fastdeploy.model_executor.layers.lm_head import ParallelLMHead from fastdeploy.model_executor.layers.moe.moe import FusedMoE from fastdeploy.model_executor.layers.normalization import RMSNorm from fastdeploy.model_executor.models.model_base import ModelForCasualLM from fastdeploy.model_executor.models.qwen3 import Qwen3Attention -from fastdeploy.model_executor.forward_meta import ForwardMeta class Qwen3MLP(nn.Layer): @@ -49,13 +48,13 @@ def __init__( prefix: str = "", ) -> None: super().__init__() - self.nranks = fd_config.parallel_config.tensor_parallel_degree + self.nranks = fd_config.parallel_config.tensor_parallel_size self.gate_up_proj = MergedColumnParallelLinear( fd_config, prefix=f"{prefix}.up_gate_proj", input_size=fd_config.model_config.hidden_size, - output_size=fd_config.model_config.ffn_hidden_size * 2, + output_size=fd_config.model_config.intermediate_size * 2, with_bias=False, activation=fd_config.model_config.hidden_act, ) @@ -63,7 +62,7 @@ def __init__( self.down_proj = RowParallelLinear( fd_config, prefix=f"{prefix}.down_proj", - input_size=fd_config.model_config.ffn_hidden_size, + input_size=fd_config.model_config.intermediate_size, output_size=fd_config.model_config.hidden_size, with_bias=False, ) @@ -115,14 +114,14 @@ def __init__( f"{prefix}.mlp.experts.{{}}.down_proj.weight", } - if (fd_config.moe_config.num_experts is not None - and layer_id >= fd_config.moe_config.moe_layer_start_index): + if (fd_config.model_config.moe_num_experts is not None + and layer_id >= fd_config.model_config.moe_layer_start_index): self.mlp = FusedMoE(fd_config, - moe_intermediate_size=fd_config.moe_config. + moe_intermediate_size=fd_config.model_config. 
moe_intermediate_size, - num_experts=fd_config.moe_config.num_experts, - top_k=fd_config.moe_config.top_k, + num_experts=fd_config.model_config.moe_num_experts, + top_k=fd_config.model_config.moe_topk, layer_idx=layer_id, weight_key_map=weight_key_map) else: @@ -199,21 +198,21 @@ def __init__( """ super().__init__() - self.num_layers = fd_config.model_config.num_layers - fd_config.model_config.prefix_name = "model" + self.num_layers = fd_config.model_config.num_hidden_layers + fd_config.model_config.pretrained_config.prefix_name = "model" self.embeddings = VocabParallelEmbedding( fd_config, num_embeddings=fd_config.model_config.vocab_size, embedding_dim=fd_config.model_config.hidden_size, params_dtype=paddle.get_default_dtype, - prefix=(f"{fd_config.model_config.prefix_name}.embed_tokens"), + prefix=(f"{fd_config.model_config.pretrained_config.prefix_name}.embed_tokens"), ) self.layers = nn.LayerList([ Qwen3DecoderLayer( fd_config, - prefix=f"{fd_config.model_config.prefix_name}.layers.{i}") + prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.layers.{i}") for i in range(self.num_layers) ]) @@ -221,7 +220,7 @@ def __init__( fd_config, hidden_size=fd_config.model_config.hidden_size, eps=1e-6, - prefix=f"{fd_config.model_config.prefix_name}.norm", + prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.norm", ) def load_state_dict(self, state_dict): @@ -338,7 +337,7 @@ def _init_weight(self, layer): return None @classmethod - def _get_tensor_parallel_mappings(cls, config: ModelConfig, is_split=True): + def _get_tensor_parallel_mappings(cls, config, is_split=True): # TODO not support TP split now, next PR will support TP. from paddleformers.transformers.conversion_utils import \ @@ -351,7 +350,7 @@ def _get_tensor_parallel_mappings(cls, config: ModelConfig, is_split=True): num_attention_heads=config.num_attention_heads, ) - def get_tensor_parallel_split_mappings(num_layers, moe_num_experts): + def get_tensor_parallel_split_mappings(num_layers, num_experts): final_actions = {} base_actions = { @@ -402,23 +401,23 @@ def get_tensor_parallel_split_mappings(num_layers, moe_num_experts): for key, action in base_actions.items(): for i in range(num_layers): newkey = key.replace("layers.0.", f"layers.{i}.") - for j in range(moe_num_experts): + for j in range(num_experts): newkey2 = newkey.replace("experts.0.", f"experts.{j}.") final_actions[newkey2] = action return final_actions - moe_num_experts = 0 + num_experts = 0 if isinstance(config.moe_num_experts, list): - moe_num_experts = sum(config.moe_num_experts) + num_experts = sum(config.moe_num_experts) elif isinstance(config.moe_num_experts, int): - moe_num_experts = config.moe_num_experts + num_experts = config.moe_num_experts else: raise ValueError( - f"Not support type of moe_num_experts [{type(config.moe_num_experts)}]" + f"Not support type of num_experts [{type(config.moe_num_experts)}]" ) - mappings = get_tensor_parallel_split_mappings(config.num_layers, - moe_num_experts) + mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers, + num_experts) return mappings diff --git a/fastdeploy/model_executor/models/tp_utils.py b/fastdeploy/model_executor/models/tp_utils.py index b7be6ff4db..001a6ce089 100644 --- a/fastdeploy/model_executor/models/tp_utils.py +++ b/fastdeploy/model_executor/models/tp_utils.py @@ -36,10 +36,9 @@ def check_tensor_parallel_prerequisites( safetensor_keys: List[str], ) -> None: """check_tensor_parallel_prerequisites""" - if fd_config.parallel_config.tensor_parallel_degree > 1: + if 
fd_config.parallel_config.tensor_parallel_size > 1: tensor_parallel_map = cls._get_tensor_parallel_mappings( - fd_config.model_config, is_split=True - ) + fd_config.model_config.pretrained_config, is_split=True) if not tensor_parallel_map: logger.error( "filtered_quant_map should not be empty. \ diff --git a/fastdeploy/rl/rollout_model.py b/fastdeploy/rl/rollout_model.py index 9e599bcfe1..ab16741e41 100644 --- a/fastdeploy/rl/rollout_model.py +++ b/fastdeploy/rl/rollout_model.py @@ -165,7 +165,7 @@ def _add_layer_mappings(layer_idx, is_moe_layer=False): infer_to_train[f"{infer_base_name}.{layer_idx}.mlp.fused_moe.gate_weight"] = \ f"{train_base_name}.{layer_idx}.mlp.gate.weight" - if self.fd_config.moe_config.moe_use_aux_free: + if self.fd_config.model_config.moe_use_aux_free: infer_to_train[f"{infer_base_name}.{layer_idx}.mlp.fused_moe.gate_correction_bias"] = \ f"{train_base_name}.{layer_idx}.mlp.moe_statics.e_score_correction_bias" @@ -178,7 +178,7 @@ def _add_layer_mappings(layer_idx, is_moe_layer=False): f"{train_base_name}.{layer_idx}.mlp.shared_experts.down_proj.weight" # MoE experts mappings - for expert_idx in range(self.fd_config.moe_config.num_experts): + for expert_idx in range(self.fd_config.model_config.moe_num_experts): for ph in place_holders: # FFN1 (up_gate_proj) ffn1_key = f"{infer_base_name}.{layer_idx}.mlp.fused_moe.moe_ffn1_weight" @@ -198,12 +198,12 @@ def _add_layer_mappings(layer_idx, is_moe_layer=False): # Process non-MoE layers for layer_idx in range( - self.fd_config.moe_config.moe_layer_start_index): + self.fd_config.model_config.moe_layer_start_index): _add_layer_mappings(layer_idx, is_moe_layer=False) # Process MoE layers - for layer_idx in range(self.fd_config.moe_config.moe_layer_start_index, - self.fd_config.model_config.num_layers): + for layer_idx in range(self.fd_config.model_config.moe_layer_start_index, + self.fd_config.model_config.num_hidden_layers): _add_layer_mappings(layer_idx, is_moe_layer=True) return infer_to_train @@ -278,7 +278,7 @@ def _add_layer_mappings(layer_idx): f"{train_base_name}.{layer_idx}.mlp.down_proj.{ph}" for layer_idx in range( - self.fd_config.model_config.num_layers): + self.fd_config.model_config.num_hidden_layers): _add_layer_mappings(layer_idx) return infer_to_train @@ -396,7 +396,7 @@ def _add_layer_mappings(layer_idx, is_moe_layer=False): ) # Process MoE layers - for layer_idx in range(self.fd_config.model_config.num_layers): + for layer_idx in range(self.fd_config.model_config.num_hidden_layers): _add_layer_mappings(layer_idx, is_moe_layer=True) return infer_to_train diff --git a/fastdeploy/spec_decode/mtp.py b/fastdeploy/spec_decode/mtp.py index cf24a7e578..ed25bc78a4 100644 --- a/fastdeploy/spec_decode/mtp.py +++ b/fastdeploy/spec_decode/mtp.py @@ -21,6 +21,7 @@ import paddle from fastdeploy.engine.request import Request +from fastdeploy.model_executor.forward_meta import ForwardMeta from fastdeploy.model_executor.layers.attention import get_attention_backend from fastdeploy.model_executor.layers.attention.base_attention_backend import \ AttentionBackend @@ -36,7 +37,6 @@ share_external_data) from fastdeploy.model_executor.pre_and_post_process import (pre_process, rebuild_padding) -from fastdeploy.model_executor.forward_meta import ForwardMeta from .base import Proposer @@ -49,7 +49,7 @@ class MTPProposer(Proposer): def __init__(self, cfg, main_model, local_rank, device_id, main_model_inputs): super().__init__(cfg) - self.num_main_model_layers = self.model_config.num_layers + self.num_main_model_layers = 
self.model_config.num_hidden_layers self.local_rank = local_rank self.device_id = device_id self._update_cfg(main_model) @@ -70,10 +70,10 @@ def _update_cfg(self, main_model): """ self.model_config.architectures[0] = "Ernie4_5_MTPForCausalLM" self.speculative_config.sharing_model = main_model - self.model_config.num_layers = 1 + self.model_config.num_hidden_layers = 1 self.parallel_config.model_name_or_path = ( self.speculative_config.model_name_or_path) - self.model_config.prefix_name = "ernie.mtp_block" + self.model_config.pretrained_config.prefix_name = "ernie.mtp_block" if self.speculative_config.quantization != "": self.model_config.quantization = ( self.speculative_config.quantization) @@ -145,7 +145,7 @@ def initialize_kv_cache(self): cache_kvs_list = [] for i in range( self.num_main_model_layers, - self.num_main_model_layers + self.model_config.num_layers): + self.num_main_model_layers + self.model_config.num_hidden_layers): key_cache = paddle.empty(shape=[], dtype=cache_type) key_cache_name = f"key_caches_{i}_rank{self.local_rank}.device{self.device_id}" val_cache_name = f"value_caches_{i}_rank{self.local_rank}.device{self.device_id}" @@ -159,7 +159,7 @@ def initialize_kv_cache(self): self.model_inputs["caches"] = cache_kvs_list else: - for i in range(self.model_config.num_layers): + for i in range(self.model_config.num_hidden_layers): self.cache_kvs["key_caches_{}".format(i)] = paddle.full( shape=kv_cache_shape, fill_value=0, @@ -183,10 +183,10 @@ def _initialize_attn_backend(self, ) -> None: # TODO(gongshaotian): Get rank from config num_heads = (self.model_config.num_attention_heads // - self.parallel_config.tensor_parallel_degree) + self.parallel_config.tensor_parallel_size) self.model_config.kv_num_heads = ( int(self.model_config.num_key_value_heads) // - self.parallel_config.tensor_parallel_degree) + self.parallel_config.tensor_parallel_size) head_dim = self.model_config.head_dim # Get the attention backend @@ -608,7 +608,7 @@ def _propose(self, target_hidden_states): self.model_inputs, ) - if self.parallel_config.tensor_parallel_degree > 1: + if self.parallel_config.tensor_parallel_size > 1: paddle.distributed.broadcast(sampled_token_ids, 0) self._post_process(sampled_token_ids) diff --git a/fastdeploy/worker/gcu_model_runner.py b/fastdeploy/worker/gcu_model_runner.py index eee5dbf8e1..5756bdbe31 100644 --- a/fastdeploy/worker/gcu_model_runner.py +++ b/fastdeploy/worker/gcu_model_runner.py @@ -670,7 +670,7 @@ def initialize_kv_cache(self) -> None: # Get kv cache shape kv_cache_shape = self.attn_backends[0].get_kv_cache_shape( max_num_blocks=max_block_num) - # local_rank = self.local_rank % self.parallel_config.tensor_parallel_degree + # local_rank = self.local_rank % self.parallel_config.tensor_parallel_size if not self.parallel_config.do_profile and ( self.parallel_config.enable_prefix_caching \ @@ -679,7 +679,7 @@ def initialize_kv_cache(self) -> None: "prefix_caching is not support by GCUModelRunner." 
) else: - for i in range(self.model_config.num_layers): + for i in range(self.model_config.num_hidden_layers): cache_kvs["key_caches_{}".format(i)] = paddle.full( shape=kv_cache_shape, @@ -701,10 +701,10 @@ def initialize_attn_backend(self) -> None: """ assert len(self.attn_backends) == 0 - num_heads = self.model_config.num_attention_heads // self.parallel_config.tensor_parallel_degree + num_heads = self.model_config.num_attention_heads // self.parallel_config.tensor_parallel_size self.model_config.kv_num_heads = int( self.model_config.num_key_value_heads - ) // self.parallel_config.tensor_parallel_degree + ) // self.parallel_config.tensor_parallel_size head_dim = self.model_config.head_dim # Get the attention backend @@ -783,14 +783,14 @@ def _dummy_run(self, ) sampler_output = self.sampler(logits, self.sampling_metadata) - if self.parallel_config.tensor_parallel_degree > 1: + if self.parallel_config.tensor_parallel_size > 1: paddle.distributed.broadcast(sampler_output.sampled_token_ids, 0) else: self.sampler(logits, self.sampling_metadata, self.parallel_config.max_model_len, self.share_inputs) sampler_output = None - if self.parallel_config.tensor_parallel_degree > 1: + if self.parallel_config.tensor_parallel_size > 1: paddle.distributed.broadcast( self.share_inputs["accept_tokens"], 0) paddle.distributed.broadcast( @@ -1016,14 +1016,14 @@ class at the server level, which is too granular for ModelRunner. self.sampling_metadata, skip_idx_list, ) - if self.parallel_config.tensor_parallel_degree > 1: + if self.parallel_config.tensor_parallel_size > 1: paddle.distributed.broadcast(sampler_output.sampled_token_ids, 0) else: self.sampler(logits, self.sampling_metadata, self.parallel_config.max_model_len, self.share_inputs) sampler_output = None - if self.parallel_config.tensor_parallel_degree > 1: + if self.parallel_config.tensor_parallel_size > 1: paddle.distributed.broadcast( self.share_inputs["accept_tokens"], 0) paddle.distributed.broadcast(self.share_inputs["accept_num"], @@ -1192,11 +1192,11 @@ def cal_theortical_kvcache(self): byte_of_dtype = 2 hidden_dim = self.model_config.head_dim * self.model_config.kv_num_heads - num_layers = self.model_config.num_layers + \ + num_layers = self.model_config.num_hidden_layers + \ self.speculative_config.num_gpu_block_expand_ratio if \ self.speculative_method in [ "mtp" - ] else self.model_config.num_layers + ] else self.model_config.num_hidden_layers required_memory = ( byte_of_dtype * 2 * # k + v (self.parallel_config.block_size * hidden_dim) * num_layers) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index f22d7cc6ef..655aa5f8c6 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -259,7 +259,7 @@ def insert_prefill_inputs(self, req_dicts: List[Request]): self.share_inputs["min_dec_len"][idx:idx + 1] = request.get( "min_tokens", 1) self.share_inputs["max_dec_len"][idx:idx + 1] = request.get( - "max_tokens", self.model_config.max_length) + "max_tokens", self.model_config.max_model_len) self.share_inputs["stop_flags"][idx:idx + 1] = False self.share_inputs["first_token_ids"][ @@ -375,11 +375,11 @@ def _init_share_inputs(self, max_num_seqs: int): self.share_inputs["min_dec_len"] = paddle.full( [max_num_seqs, 1], self.model_config.min_length, dtype='int64') self.share_inputs["max_dec_len"] = paddle.full( - [max_num_seqs, 1], self.model_config.max_length, dtype='int64') + [max_num_seqs, 1], self.model_config.max_model_len, dtype='int64') 
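Editor's note: the model runners touched in this patch all repeat the same sample-then-broadcast gate, now keyed on the renamed tensor_parallel_size. A condensed, illustrative sketch of that pattern follows (the sampler and metadata objects are stand-ins, not the runners' actual types):

import paddle.distributed as dist

def sample_and_sync(sampler, logits, sampling_metadata, tensor_parallel_size: int):
    """Sample on every TP rank, then overwrite with rank 0's token ids so all
    ranks feed identical tokens into the next decoding step."""
    sampler_output = sampler(logits, sampling_metadata)
    if tensor_parallel_size > 1:
        # same call as in the hunks above: broadcast from src rank 0
        dist.broadcast(sampler_output.sampled_token_ids, 0)
    return sampler_output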
self.share_inputs["min_length"] = paddle.full( [max_num_seqs, 1], self.model_config.min_length, dtype='int64') self.share_inputs["max_length"] = paddle.full( - [max_num_seqs, 1], self.model_config.max_length, dtype='int64') + [max_num_seqs, 1], self.model_config.max_model_len, dtype='int64') self.share_inputs["seq_lens_this_time"] = paddle.full(max_num_seqs, 0, dtype='int32') @@ -666,13 +666,13 @@ def initialize_kv_cache(self) -> None: # Get kv cache shape kv_cache_shape = self.attn_backends[0].get_kv_cache_shape( max_num_blocks=max_block_num) - local_rank = self.local_rank % self.parallel_config.tensor_parallel_degree + local_rank = self.local_rank % self.parallel_config.tensor_parallel_size if not self.parallel_config.do_profile and ( self.parallel_config.enable_prefix_caching \ or self.parallel_config.splitwise_role != "mixed"): cache_kvs_list = [] - for i in range(self.model_config.num_layers): + for i in range(self.model_config.num_hidden_layers): key_cache = paddle.empty(shape=[], dtype=cache_type) key_cache_name = f"key_caches_{i}_rank{local_rank}.device{self.device_id}" val_cache_name = f"value_caches_{i}_rank{local_rank}.device{self.device_id}" @@ -687,7 +687,7 @@ def initialize_kv_cache(self) -> None: self.share_inputs["caches"] = cache_kvs_list else: - for i in range(self.model_config.num_layers): + for i in range(self.model_config.num_hidden_layers): cache_kvs["key_caches_{}".format(i)] = paddle.full( shape=kv_cache_shape, @@ -710,10 +710,10 @@ def initialize_attn_backend(self) -> None: """ assert len(self.attn_backends) == 0 - num_heads = self.model_config.num_attention_heads // self.parallel_config.tensor_parallel_degree + num_heads = self.model_config.num_attention_heads // self.parallel_config.tensor_parallel_size self.model_config.kv_num_heads = max(1, int( self.model_config.num_key_value_heads - ) // self.parallel_config.tensor_parallel_degree) + ) // self.parallel_config.tensor_parallel_size) head_dim = self.model_config.head_dim # Get the attention backend @@ -787,14 +787,14 @@ def _dummy_run(self, ) sampler_output = self.sampler(logits, self.sampling_metadata) - if self.parallel_config.tensor_parallel_degree > 1: + if self.parallel_config.tensor_parallel_size > 1: paddle.distributed.broadcast(sampler_output.sampled_token_ids, 0) else: self.sampler(logits, self.sampling_metadata, self.parallel_config.max_model_len, self.share_inputs) sampler_output = None - if self.parallel_config.tensor_parallel_degree > 1: + if self.parallel_config.tensor_parallel_size > 1: paddle.distributed.broadcast( self.share_inputs["accept_tokens"], 0) paddle.distributed.broadcast( @@ -1021,14 +1021,14 @@ class at the server level, which is too granular for ModelRunner. 
self.sampling_metadata, skip_idx_list, ) - if self.parallel_config.tensor_parallel_degree > 1: + if self.parallel_config.tensor_parallel_size > 1: paddle.distributed.broadcast(sampler_output.sampled_token_ids, 0) else: self.sampler(logits, self.sampling_metadata, self.parallel_config.max_model_len, self.share_inputs) sampler_output = None - if self.parallel_config.tensor_parallel_degree > 1: + if self.parallel_config.tensor_parallel_size > 1: paddle.distributed.broadcast( self.share_inputs["accept_tokens"], 0) paddle.distributed.broadcast(self.share_inputs["accept_num"], @@ -1206,11 +1206,11 @@ def cal_theortical_kvcache(self): hidden_dim = self.model_config.head_dim * self.model_config.kv_num_heads # NOTE(liuzichang): Implement multi-layer MTP architecture in the future - num_layers = self.model_config.num_layers + \ + num_layers = self.model_config.num_hidden_layers + \ self.speculative_config.num_gpu_block_expand_ratio if \ self.speculative_method in [ "mtp" - ] else self.model_config.num_layers + ] else self.model_config.num_hidden_layers required_memory = ( byte_of_dtype * 2 * # k + v (self.parallel_config.block_size * hidden_dim) * num_layers) diff --git a/fastdeploy/worker/iluvatar_model_runner.py b/fastdeploy/worker/iluvatar_model_runner.py index 8d9477b78f..b0caa7d3b8 100644 --- a/fastdeploy/worker/iluvatar_model_runner.py +++ b/fastdeploy/worker/iluvatar_model_runner.py @@ -648,7 +648,7 @@ def initialize_kv_cache(self) -> None: or self.parallel_config.splitwise_role != "mixed"): raise NotImplementedError("Iluvatar does not support yet") else: - for i in range(self.model_config.num_layers): + for i in range(self.model_config.num_hidden_layers): cache_kvs["key_caches_{}".format(i)] = paddle.full( shape=kv_cache_shape, @@ -672,11 +672,11 @@ def initialize_attn_backend(self) -> None: assert len(self.attn_backends) == 0 # TODO(gongshaotian): Get rank from config - num_heads = self.model_config.num_attention_heads // self.parallel_config.tensor_parallel_degree + num_heads = self.model_config.num_attention_heads // self.parallel_config.tensor_parallel_size self.model_config.kv_num_heads = max( 1, int(self.model_config.num_key_value_heads) // - self.parallel_config.tensor_parallel_degree) + self.parallel_config.tensor_parallel_size) head_dim = self.model_config.head_dim # Get the attention backend @@ -748,14 +748,14 @@ def _dummy_run(self, ) sampled_token_ids = self.sampler(logits, self.sampling_metadata) - if self.parallel_config.tensor_parallel_degree > 1: + if self.parallel_config.tensor_parallel_size > 1: paddle.distributed.broadcast(sampled_token_ids, 0) else: self.sampler(logits, self.sampling_metadata, self.parallel_config.max_model_len, self.share_inputs) sampled_token_ids = None - if self.parallel_config.tensor_parallel_degree > 1: + if self.parallel_config.tensor_parallel_size > 1: paddle.distributed.broadcast( self.share_inputs["accept_tokens"], 0) paddle.distributed.broadcast( @@ -977,14 +977,14 @@ class at the server level, which is too granular for ModelRunner. 
self.sampling_metadata, skip_idx_list, ) - if self.parallel_config.tensor_parallel_degree > 1: + if self.parallel_config.tensor_parallel_size > 1: paddle.distributed.broadcast(sampled_token_ids, 0) else: self.sampler(logits, self.sampling_metadata, self.parallel_config.max_model_len, self.share_inputs) sampled_token_ids = None - if self.parallel_config.tensor_parallel_degree > 1: + if self.parallel_config.tensor_parallel_size > 1: paddle.distributed.broadcast( self.share_inputs["accept_tokens"], 0) paddle.distributed.broadcast(self.share_inputs["accept_num"], @@ -1145,11 +1145,11 @@ def cal_theortical_kvcache(self): hidden_dim = self.model_config.head_dim * self.model_config.kv_num_heads # NOTE(liuzichang): Implement multi-layer MTP architecture in the future - num_layers = self.model_config.num_layers + \ + num_layers = self.model_config.num_hidden_layers + \ self.speculative_config.num_gpu_block_expand_ratio if \ self.speculative_method in [ "mtp" - ] else self.model_config.num_layers + ] else self.model_config.num_hidden_layers required_memory = ( byte_of_dtype * 2 * # k + v (self.parallel_config.block_size * hidden_dim) * num_layers) diff --git a/fastdeploy/worker/vl_gpu_model_runner.py b/fastdeploy/worker/vl_gpu_model_runner.py index ac54fb2181..82af454b5f 100644 --- a/fastdeploy/worker/vl_gpu_model_runner.py +++ b/fastdeploy/worker/vl_gpu_model_runner.py @@ -29,8 +29,6 @@ from fastdeploy.model_executor.layers.rotary_embedding import get_rope_3d from fastdeploy.model_executor.layers.sample.meta_data import SamplingMetadata from fastdeploy.model_executor.layers.sample.sampler import Sampler -from fastdeploy.model_executor.models.ernie4_5_vl.configuration import \ - Ernie4_5_VLMoeConfig from fastdeploy.model_executor.models.ernie4_5_vl.modeling_resampler import \ ScatterOp from fastdeploy.platforms import current_platform @@ -221,9 +219,9 @@ def _load_model( fd_config = initialize_fd_config( self.args, self.tensor_parallel_degree, self.tensor_parallel_rank ) - fd_config.model_config = Ernie4_5_VLMoeConfig( - **fd_config.model_config.__dict__ - ) + fd_config.model_config.tensor_parallel_degree=self.tensor_parallel_degree + fd_config.model_config.tensor_parallel_rank=self.tensor_parallel_rank + fd_config.model_config.moe_group="dummy" fd_config.parallel_config.column_cut = False vision_config = fd_config.model_config.vision_config vision_config.attn_sep = False @@ -237,8 +235,8 @@ def _load_model( fd_config.model_config.think_end_id = tokenizer.get_vocab()[""] fd_config.model_config.max_text_id = fd_config.model_config.im_patch_id fd_config.model_config.sequence_parallel = False - # TODO (bukejiyu): Remove the assignment - fd_config.moe_config.top_k = 8 + # TODO(YuanRisheng) The moe_k in develop is fixed to 8, need to be changed according to json config + fd_config.model_config.moe_k = 8 self.fd_config = fd_config self.model_cfg = self.fd_config.model_config self.image_preprocess = self._init_image_preprocess( @@ -250,10 +248,10 @@ def _load_model( self.model = get_model_from_loader(self.fd_config) attn_backend_cls = get_attention_backend() num_heads = self.fd_config.model_config.num_attention_heads // \ - self.fd_config.parallel_config.tensor_parallel_degree + self.fd_config.parallel_config.tensor_parallel_size self.fd_config.model_config.kv_num_heads = int( self.fd_config.model_config.num_key_value_heads - ) // self.fd_config.parallel_config.tensor_parallel_degree + ) // self.fd_config.parallel_config.tensor_parallel_size head_dim = self.fd_config.model_config.head_dim self.attn_backend 
= attn_backend_cls( self.fd_config, @@ -305,14 +303,10 @@ def _init_kvcache(self): """ cache_kvs = {} total_block_num = self.num_gpu_blocks - num_layers = self.model_cfg.get("num_layers", - None) or self.model_cfg.get( - "num_hidden_layers", None) + num_layers = self.model_cfg.num_hidden_layers + + kv_num_head = self.model_cfg.num_key_value_heads if self.model_cfg.num_key_value_heads != -1 else self.model_cfg.num_attention_heads - kv_num_head = self.model_cfg.get( - "num_key_value_heads", - self.model_cfg.num_attention_heads, - ) kv_num_head = kv_num_head // self.tensor_parallel_degree self.model_cfg.kv_num_head = kv_num_head @@ -647,7 +641,7 @@ def generate(self) -> None: ) # sampler & save_output sampler_output = self.sampler(logits, self.sampling_metadata) - if self.fd_config.parallel_config.tensor_parallel_degree > 1: + if self.fd_config.parallel_config.tensor_parallel_size > 1: paddle.distributed.broadcast(sampler_output.sampled_token_ids, 0) self.post_process(sampler_output) @@ -740,9 +734,7 @@ def _cal_theortical_kvcache(self): """ Calculate the size of kvcache for computational theory """ - num_layers = self.model_cfg.get("num_layers", - None) or self.model_cfg.get( - "num_hidden_layers", None) + num_layers = self.model_cfg.num_hidden_layers byte_of_cache = 2 # support c8 c4 diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py index fdcddb47cf..25779767f2 100644 --- a/fastdeploy/worker/worker_process.py +++ b/fastdeploy/worker/worker_process.py @@ -22,11 +22,9 @@ import paddle.distributed as dist import paddle.distributed.fleet as fleet -from fastdeploy import envs from fastdeploy.config import (DecodingConfig, DeviceConfig, FDConfig, GraphOptimizationConfig, LoadConfig, - ModelConfig, MoEConfig, MoEPhase, - ParallelConfig, SpeculativeConfig) + ModelConfig, ParallelConfig, SpeculativeConfig) from fastdeploy.inter_communicator import EngineWorkerQueue as TaskQueue from fastdeploy.inter_communicator import IPCSignal from fastdeploy.model_executor.layers.quantization import \ @@ -122,7 +120,7 @@ def __init__( self.task_queue = TaskQueue( address=task_address, is_server=False, - num_client=self.parallel_config.tensor_parallel_degree, + num_client=self.parallel_config.tensor_parallel_size, client_id=self.parallel_config.tensor_parallel_rank, local_data_parallel_id=self.parallel_config.expert_parallel_rank) @@ -139,8 +137,8 @@ def init_health_status(self) -> None: # init worker_ready_signal max_chips_per_node = 16 if current_platform.is_iluvatar() else 8 array_size = min( - max_chips_per_node, self.parallel_config.tensor_parallel_degree * - self.parallel_config.expert_parallel_degree) + max_chips_per_node, self.parallel_config.tensor_parallel_size * + self.parallel_config.expert_parallel_size) workers_ready = np.zeros(shape=[array_size], dtype=np.int32) self.worker_ready_signal = IPCSignal( name="worker_ready_signal", @@ -173,7 +171,7 @@ def init_health_status(self) -> None: # init exist_task_signal workers_exist_task = np.zeros( - [self.parallel_config.expert_parallel_degree], dtype=np.int32) + [self.parallel_config.expert_parallel_size], dtype=np.int32) self.exist_task_signal = IPCSignal( name="exist_task_signal", array=workers_exist_task, @@ -183,7 +181,7 @@ def init_health_status(self) -> None: # init exist_swapped_task_signal workers_swapped_task = np.zeros( - shape=[self.parallel_config.expert_parallel_degree], + shape=[self.parallel_config.expert_parallel_size], dtype=np.int32) self.exist_swapped_task_signal = IPCSignal( 
name="exist_swapped_task_signal", @@ -231,8 +229,8 @@ def event_loop_normal(self) -> None: TODO(gongshaotian): support remote calling of functions that control worker. """ # Currently, only support single node - self.nnode = int((self.parallel_config.tensor_parallel_degree + 7) // 8) - mp_num_per_node = self.parallel_config.tensor_parallel_degree // self.nnode + self.nnode = int((self.parallel_config.tensor_parallel_size + 7) // 8) + mp_num_per_node = self.parallel_config.tensor_parallel_size // self.nnode req_ids = [] while True: if self.local_rank == 0: @@ -241,7 +239,7 @@ def event_loop_normal(self) -> None: else: self.exist_task_signal.value[0] = 0 - if self.parallel_config.tensor_parallel_degree > 1: + if self.parallel_config.tensor_parallel_size > 1: # Synchronize before updating weights paddle.distributed.barrier() @@ -259,7 +257,7 @@ def event_loop_normal(self) -> None: self.fd_config.parallel_config. expert_parallel_rank] = 1 - if self.parallel_config.tensor_parallel_degree > 1: + if self.parallel_config.tensor_parallel_size > 1: # Synchronize the signal for other workers # TODO(@wufeisheng): Split TP group and EP group paddle.distributed.barrier() @@ -479,8 +477,8 @@ def parse_args(): ) parser.add_argument( "--speculative_benchmark_mode", - default="false", - type=str, + default=False, + type=bool, ) parser.add_argument("--max_num_batched_tokens", type=int, @@ -559,7 +557,7 @@ def parse_args(): return args -def initialize_fd_config(config_or_args, ranks: int = 1, local_rank: int = 0) -> FDConfig: +def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig: """Initialize FDConfig from either RolloutModelConfig or argparse.Namespace Args: @@ -568,196 +566,37 @@ def initialize_fd_config(config_or_args, ranks: int = 1, local_rank: int = 0) -> Returns: FDConfig: Initialized FastDeploy configuration object """ - # Get model config from model directory - model_config_dict, _ = ModelConfig.get_config_dict(config_or_args.model_name_or_path) - - # Handle MoE related configs - if 'num_experts' in model_config_dict: - model_config_dict['moe_num_experts'] = model_config_dict.pop('num_experts') - if 'num_experts_per_tok' in model_config_dict: - model_config_dict['moe_topk'] = model_config_dict.pop('num_experts_per_tok') - - # Set default values for model config - model_config_dict["head_dim"] = model_config_dict.get( - "head_dim", model_config_dict["hidden_size"] // model_config_dict["num_attention_heads"]) - model_config_dict["rope_theta"] = model_config_dict.get("rope_theta", 10000.0) - - # Create model config object - model_config = ModelConfig.from_dict(model_config_dict) - model_config.head_dim = model_config_dict["head_dim"] - paddle.set_default_dtype(config_or_args.dtype) - if 'tie_word_embeddings' in model_config_dict: - model_config.tie_word_embeddings = model_config_dict['tie_word_embeddings'] - - # Initialize all config components - device_config = DeviceConfig() - decoding_config = DecodingConfig() - speculative_config = SpeculativeConfig() - parallel_config = ParallelConfig() - load_config = LoadConfig() - moe_config = MoEConfig() - - # Handle graph optimization config (check for attribute existence for backward compatibility) - enable_static_graph_inference = getattr(config_or_args, 'enable_static_graph_inference', False) - use_cudagraph = getattr(config_or_args, 'use_cudagraph', False) - max_capture_batch_size = getattr(config_or_args, 'max_capture_batch_size', 0) + paddle.set_default_dtype(args.dtype) + model_config = ModelConfig(vars(args)) + device_config 
= DeviceConfig(vars(args)) + decoding_config = DecodingConfig(vars(args)) + speculative_config = SpeculativeConfig(vars(args)) + parallel_config = ParallelConfig(vars(args)) + load_config = LoadConfig(vars(args)) graph_opt_config = GraphOptimizationConfig( - enable_static_graph_inference, - use_cudagraph, - max_capture_batch_size - ) + args.enable_static_graph_inference, + args.max_capture_batch_size, + vars(args)) - # Handle quantization (check for attribute existence) - model_config.quantization = getattr(config_or_args, 'quantization', None) + # Note(tangbinhan): used for load_checkpoint + model_config.pretrained_config.tensor_parallel_rank = parallel_config.tensor_parallel_rank + model_config.pretrained_config.tensor_parallel_degree = parallel_config.tensor_parallel_size + model_config.pretrained_config.is_mtp = False + model_config.pretrained_config.head_dim = model_config.head_dim - # Update speculative config_or_args - speculative_config.method = getattr(config_or_args, 'speculative_method', None) - speculative_config.num_speculative_tokens = getattr(config_or_args, 'speculative_max_draft_token_num', 0) - speculative_config.model_name_or_path = getattr(config_or_args, 'speculative_model_name_or_path', None) - speculative_config.quantization = getattr(config_or_args, 'speculative_model_quantization', None) - speculative_config.benchmark_mode = ( - getattr(config_or_args, "speculative_benchmark_mode", "false").lower() == "true" + logger.info(f"parallel_config.use_ep {parallel_config.use_ep}") + logger.info( + f"parallel_config.tensor_parallel_size {parallel_config.tensor_parallel_size}" + ) + logger.info( + f"parallel_config.tensor_parallel_rank {parallel_config.tensor_parallel_rank}" ) - # Update parallel config - parallel_config.engine_pid = getattr(config_or_args, 'engine_pid', None) - parallel_config.model_name_or_path = config_or_args.model_name_or_path - parallel_config.max_num_seqs = getattr(config_or_args, 'max_num_seqs', 0) - parallel_config.max_block_num = getattr(config_or_args, 'total_block_num', 0) - parallel_config.block_size = getattr(config_or_args, 'block_size', 64) - parallel_config.pod_ip = getattr(config_or_args, 'pod_ip', None) - parallel_config.engine_worker_queue_port = getattr(config_or_args, 'engine_worker_queue_port', 0) - parallel_config.max_model_len = getattr(config_or_args, 'max_model_len', 0) - model_config.max_seq_len = getattr(config_or_args, 'max_model_len', 0) - model_config.max_length = getattr(config_or_args, 'max_model_len', 0) - parallel_config.device_ids = getattr(config_or_args, 'device_ids', []) - parallel_config.dtype = config_or_args.dtype - parallel_config.enc_dec_block_num = getattr(config_or_args, 'enc_dec_block_num', 0) - parallel_config.kv_cache_ratio = getattr(config_or_args, 'kv_cache_ratio', 1.0) - parallel_config.first_token_id = getattr(config_or_args, 'first_token_id', None) - parallel_config.gpu_memory_utilization = getattr(config_or_args, 'gpu_memory_utilization', 0.9) - parallel_config.do_profile = getattr(config_or_args, 'do_profile', False) - parallel_config.dynamic_load_weight = getattr(config_or_args, 'dynamic_load_weight', False) - parallel_config.pad_token_id = getattr(config_or_args, 'pad_token_id', None) - parallel_config.eos_tokens_lens = getattr(config_or_args, 'eos_tokens_lens', 0) - parallel_config.enable_chunked_prefill = getattr(config_or_args, 'enable_chunked_prefill', False) - parallel_config.max_num_batched_tokens = getattr(config_or_args, 'max_num_batched_tokens', 0) - parallel_config.enable_prefix_caching = 
getattr(config_or_args, 'enable_prefix_caching', False) - parallel_config.enable_custom_all_reduce = getattr(config_or_args, 'enable_custom_all_reduce', False) - parallel_config.use_ep = getattr(config_or_args, 'enable_expert_parallell', False) - parallel_config.tensor_parallel_degree = getattr(config_or_args, 'tensor_parallel_size', 1) - parallel_config.expert_parallel_degree = getattr(config_or_args, 'expert_parallel_size', 1) - parallel_config.splitwise_role = getattr(config_or_args, 'splitwise_role', None) - parallel_config.guided_decoding_backend = getattr(config_or_args, 'guided_decoding_backend', None) - parallel_config.disable_any_whitespace = getattr(config_or_args, 'disable_any_whitespace', False) - - # Log parallel config info - logger.info(f"parallel_config.use_ep {parallel_config.use_ep}") - logger.info(f"parallel_config.tensor_parallel_degree {parallel_config.tensor_parallel_degree}") - logger.info(f"splitwise_role {parallel_config.splitwise_role}") - - # Set MoE phase based on splitwise role - if parallel_config.splitwise_role == "mixed": - parallel_config.moe_phase = MoEPhase.PREFILL - elif parallel_config.splitwise_role == "prefill": - parallel_config.moe_phase = MoEPhase.PREFILL - elif parallel_config.splitwise_role == "decode": - parallel_config.moe_phase = MoEPhase.DECODER - elif parallel_config.splitwise_role is not None: - raise NotImplementedError - - # Handle model architecture specific configurations - num_key_value_heads = model_config_dict.get("num_key_value_heads", -1) - if num_key_value_heads is None: - num_key_value_heads = -1 - - # Calculate FFN hidden size - if model_config_dict.get("ffn_hidden_size", None) is not None: - ffn_hidden_size = model_config_dict["ffn_hidden_size"] - elif model_config_dict.get("intermediate_size", None) is not None: - ffn_hidden_size = model_config_dict["intermediate_size"] - else: - ffn_hidden_size = 4 * model_config_dict["hidden_size"] - if model_config_dict["hidden_act"].lower() == "swiglu": - if paddle.distributed.get_world_size() > 1: - multiple_of = 8 * model_config_dict["num_attention_heads"] - else: - multiple_of = 4 * model_config_dict["num_attention_heads"] - ffn_hidden_size = multiple_of * ( - (int(2 * ffn_hidden_size / 3) + multiple_of - 1) // - multiple_of) - - # Get number of layers - num_layers = model_config_dict.get("num_layers", None) or model_config_dict.get( - "num_hidden_layers", None) - if num_layers is None: - raise ValueError(f"num_layers<{num_layers}> is invalid") - - if "moe_layer_start_index" in model_config_dict: - moe_layer_start_index = model_config_dict["moe_layer_start_index"] - use_moe = ( - isinstance(moe_layer_start_index, int) - and moe_layer_start_index < num_layers - ) or ( - isinstance(moe_layer_start_index, list) - and min(moe_layer_start_index) < num_layers - ) - else: - use_moe = False - - # Update model config - model_config.ffn_hidden_size = ffn_hidden_size - model_config.num_layers = num_layers - model_config.num_key_value_heads = num_key_value_heads - model_config.start_layer_index = model_config_dict.get("start_layer_index", 0) - - # Update MoE config - moe_config.num_experts = model_config_dict.get("moe_num_experts", None) - moe_config.moe_intermediate_size = model_config_dict.get("moe_intermediate_size", None) - moe_config.top_k = model_config_dict.get("moe_k", model_config_dict.get("moe_topk", 8)) - moe_config.moe_num_shared_experts = model_config_dict.get("moe_num_shared_experts", 0) - moe_config.moe_layer_start_index = model_config_dict.get("moe_layer_start_index", 0) - 
moe_config.num_max_dispatch_tokens_per_rank = model_config_dict.get( - "num_max_dispatch_tokens_per_rank", 256) - moe_config.moe_use_aux_free = model_config_dict.get("moe_use_aux_free", False) - - # Handle vocabulary size - model_config.ori_vocab_size = model_config_dict.get("vocab_size", -1) - archs = model_config_dict.get("architectures", []) - if "Ernie4_5_ForCausalLM" in archs or "Ernie4_5_MoeForCausalLM" in archs: - model_config.ori_vocab_size = getattr(config_or_args, 'ori_vocab_size', model_config.ori_vocab_size) - - # Handle DeepseekV3 specific config - if "DeepseekV3ForCausalLM" in model_config_dict.get("architectures", []): - from paddleformers.transformers import AutoConfig - model_config.deepseekv3 = AutoConfig.from_pretrained( - config_or_args.model_name_or_path) - - assert parallel_config.tensor_parallel_degree * parallel_config.expert_parallel_degree == ranks - - parallel_config.tensor_parallel_rank = \ - local_rank % parallel_config.tensor_parallel_degree - parallel_config.expert_parallel_rank = \ - int(local_rank / parallel_config.tensor_parallel_degree) - - if parallel_config.use_ep: - moe_config.num_experts_per_rank = \ - moe_config.num_experts // parallel_config.expert_parallel_degree - moe_config.num_experts_start_offset = \ - parallel_config.expert_parallel_rank * moe_config.num_experts_per_rank - - # For auto TP split - model_config.tensor_parallel_degree = parallel_config.tensor_parallel_degree - model_config.tensor_parallel_rank = parallel_config.tensor_parallel_rank - model_config.use_ep = parallel_config.use_ep - - if parallel_config.use_ep: - model_config.num_experts_per_rank = moe_config.num_experts_per_rank - model_config.num_experts_start_offset = moe_config.num_experts_start_offset - - # Handle quantization config - quantization_config = model_config_dict.get("quantization_config", None) + if getattr(model_config, 'num_hidden_layers', None) is None: + raise ValueError("num_hidden_layers is None") + + quantization_config = model_config.quantization_config if not model_config.is_quantized: if quantization_config is not None: if "kv_cache_quant_type" not in quantization_config: @@ -772,16 +611,15 @@ def initialize_fd_config(config_or_args, ranks: int = 1, local_rank: int = 0) -> if quantization_config is not None: quant_config_name = quantization_config["quantization"] - elif getattr(config_or_args, 'quantization', None) != "None": + elif args.quantization != "None": quantization_config = {} - quant_config_name = getattr(config_or_args, 'quantization', None) + quant_config_name = args.quantization quantization_config["quantization"] = quant_config_name # Special handling for Ernie models - is_ernie = "Ernie4_5_ForCausalLM" in model_config_dict.get("architectures", []) or \ - "Ernie4_5_MoeForCausalLM" in model_config_dict.get("architectures", []) or \ - "Ernie4_5_VLMoeForConditionalGeneration" in model_config_dict.get( - "architectures", []) - if use_moe and quant_config_name == "wint4" and is_ernie: + is_ernie = "Ernie4_5_ForCausalLM" in model_config.architectures or \ + "Ernie4_5_MoeForCausalLM" in model_config.architectures or \ + "Ernie4_5_VLMoeForConditionalGeneration" in model_config.architectures + if quant_config_name == "wint4" and is_ernie: quantization_config["dense_quant_type"] = "wint8" quantization_config["moe_quant_type"] = "wint4" quantization_config["quantization"] = "mix_quant" @@ -806,38 +644,23 @@ def initialize_fd_config(config_or_args, ranks: int = 1, local_rank: int = 0) -> logger.info( "Model Status: Original (will apply online 
quantization)") - logger.info(f"Quantization Method: {getattr(config_or_args, 'quantization', 'None')}") + logger.info(f"{quantization_config}") else: logger.info( "No quantization config found and use original weight and act dtype." ) - model_config.enable_logprob = config_or_args.enable_logprob - - model_config.architectures = model_config_dict.get("architectures") - - # Update load config - logger.info("===========load_config==============") - # Handle load config (check for environment variable) - load_config.use_fastsafetensor = int(envs.FD_USE_FASTSAFETENSOR) == 1 - load_config.dynamic_load_weight = getattr(config_or_args, 'dynamic_load_weight', False) - load_config.load_strategy = getattr(config_or_args, 'load_strategy', None) logger.info(f"- Dynamic load weight: {load_config.dynamic_load_weight}") logger.info(f"- Load strategy: {load_config.load_strategy}") - logger.info(f"- Use fastsafetensor: {load_config.use_fastsafetensor}") - - # Create and return FDConfig - fd_config = FDConfig( - model_config=model_config, - parallel_config=parallel_config, - speculative_config=speculative_config, - device_config=device_config, - load_config=load_config, - moe_config=moe_config, - decoding_config=decoding_config, - quant_config=quant_config, - graph_opt_config=graph_opt_config - ) + + fd_config = FDConfig(model_config=model_config, + parallel_config=parallel_config, + speculative_config=speculative_config, + device_config=device_config, + load_config=load_config, + decoding_config=decoding_config, + quant_config=quant_config, + graph_opt_config=graph_opt_config) return fd_config diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py index 9099339766..12d89c4e79 100644 --- a/fastdeploy/worker/xpu_model_runner.py +++ b/fastdeploy/worker/xpu_model_runner.py @@ -314,7 +314,7 @@ def process_prefill_inputs(self, req_dicts: List[Request]): "min_tokens", 1) self.share_inputs["max_dec_len"][idx:idx + 1] = request.get( - "max_tokens", self.model_config.max_length) + "max_tokens", self.model_config.max_model_len) self.share_inputs["stop_flags"][idx:idx + 1] = False self.share_inputs["first_token_ids"][ @@ -387,11 +387,11 @@ def _init_share_inputs(self, max_num_seqs: int): self.share_inputs["min_dec_len"] = paddle.full( [max_num_seqs, 1], self.model_config.min_length, dtype='int64') self.share_inputs["max_dec_len"] = paddle.full( - [max_num_seqs, 1], self.model_config.max_length, dtype='int64') + [max_num_seqs, 1], self.model_config.max_model_len, dtype='int64') self.share_inputs["min_length"] = paddle.full( [max_num_seqs, 1], self.model_config.min_length, dtype='int64') self.share_inputs["max_length"] = paddle.full( - [max_num_seqs, 1], self.model_config.max_length, dtype='int64') + [max_num_seqs, 1], self.model_config.max_model_len, dtype='int64') self.share_inputs["seq_lens_this_time"] = paddle.full(max_num_seqs, 0, dtype='int32') @@ -574,7 +574,7 @@ def initialize_kv_cache(self) -> None: kv_cache_shape = self.attn_backends[0].get_kv_cache_shape( max_num_blocks=max_block_num) - for i in range(self.model_config.num_layers): + for i in range(self.model_config.num_hidden_layers): cache_kvs["key_caches_{}".format(i)] = paddle.full( shape=kv_cache_shape, fill_value=0, @@ -597,10 +597,10 @@ def initialize_attn_backend(self) -> None: assert len(self.attn_backends) == 0 # TODO(gongshaotian): Get rank from config - num_heads = self.model_config.num_attention_heads // self.parallel_config.tensor_parallel_degree + num_heads = self.model_config.num_attention_heads // 
self.parallel_config.tensor_parallel_size self.model_config.kv_num_heads = int( self.model_config.num_key_value_heads - ) // self.parallel_config.tensor_parallel_degree + ) // self.parallel_config.tensor_parallel_size head_dim = self.model_config.head_dim # Get the attention backend @@ -803,7 +803,7 @@ def cal_theortical_kvcache(self): required_memory = ( byte_of_dtype * 2 * # k + v (self.parallel_config.block_size * hidden_dim) * - self.model_config.num_layers) + self.model_config.num_hidden_layers) return required_memory def update_share_input_block_num(self, num_gpu_blocks: int) -> None:
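Editor's note: every runner's cal_theortical_kvcache() now sizes the cache with num_hidden_layers. For reference, the per-block estimate those hunks compute, restated as a standalone sketch (illustrative only; the example numbers are invented, and the max(1, ...) guard follows the GPU runner):

def kv_cache_bytes_per_block(num_hidden_layers: int,
                             num_key_value_heads: int,
                             head_dim: int,
                             block_size: int,
                             tensor_parallel_size: int,
                             byte_of_dtype: int = 2) -> int:
    """Bytes for one block of K and V across all decoder layers on one TP rank."""
    kv_num_heads = max(1, num_key_value_heads // tensor_parallel_size)
    hidden_dim = head_dim * kv_num_heads
    # byte_of_dtype * 2 (k + v) * tokens per block * hidden dim * layers
    # (the speculative "mtp" path above additionally expands num_layers; omitted here)
    return byte_of_dtype * 2 * (block_size * hidden_dim) * num_hidden_layers

# Hypothetical example: 28 layers, 4 KV heads of dim 128, block_size 64, TP 1:
# kv_cache_bytes_per_block(28, 4, 128, 64, 1) == 3_670_016 bytes (3.5 MiB).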