Skip to content

Commit 26d5d73

Browse files
authored
【Feature】support some qwen2 functions (#2740)
* add RL qwen model support * fix * fix
1 parent fefbd65 commit 26d5d73

File tree

13 files changed

+442
-178
lines changed

13 files changed

+442
-178
lines changed

fastdeploy/distributed/communication_op.py

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,15 @@
1717
import paddle
1818
import paddle.distributed as dist
1919

20-
21-
@paddle.jit.marker.unified
22-
def tensor_model_parallel_all_reduce(input_: paddle.Tensor) -> paddle.Tensor:
23-
"""All-reduce the input tensor across model parallel group."""
24-
if paddle.in_dynamic_mode():
25-
hcg = dist.fleet.get_hybrid_communicate_group()
26-
mp_group = hcg.get_model_parallel_group()
27-
dist.all_reduce(input_, group=mp_group)
28-
else:
29-
dist.all_reduce(input_)
20+
try:
21+
@paddle.jit.marker.unified
22+
def tensor_model_parallel_all_reduce(input_: paddle.Tensor) -> paddle.Tensor:
23+
"""All-reduce the input tensor across model parallel group."""
24+
if paddle.in_dynamic_mode():
25+
hcg = dist.fleet.get_hybrid_communicate_group()
26+
mp_group = hcg.get_model_parallel_group()
27+
dist.all_reduce(input_, group=mp_group)
28+
else:
29+
dist.all_reduce(input_)
30+
except:
31+
tensor_model_parallel_all_reduce=None

fastdeploy/model_executor/layers/attention/flash_attn_backend.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,11 @@
2121
from typing import List, Optional
2222

2323
import paddle
24-
from paddle.nn.functional.flash_attention import flash_attention_v3_varlen
24+
25+
try:
26+
from paddle.nn.functional.flash_attention import flash_attention_v3_varlen
27+
except:
28+
flash_attention_v3_varlen = None
2529

2630
from fastdeploy.config import FDConfig
2731
from fastdeploy.model_executor.layers.attention.attention import Attention

fastdeploy/model_executor/layers/linear.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -294,7 +294,7 @@ def init_weight(self):
294294
)
295295
if self.nranks > 0:
296296
# col parallel
297-
_set_var_distributed(self.linear_weight, split_axis=-1)
297+
_set_var_distributed(self.linear_weight, split_axis=1)
298298

299299
self.linear_bias = None
300300
if self.with_bias:
@@ -305,7 +305,7 @@ def init_weight(self):
305305
)
306306
if self.nranks > 0:
307307
# col parallel
308-
_set_var_distributed(self.linear_bias, split_axis=-1)
308+
_set_var_distributed(self.linear_bias, split_axis=1)
309309

310310
# smooth quant
311311
self.linear_shift = None

fastdeploy/model_executor/layers/moe/moe.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ def __init__(
8989
self.routed_scaling_factor = routed_scaling_factor
9090

9191
moe_quant_config = fd_config.quant_config
92+
self.moe_quant_type = None
9293
if moe_quant_config:
9394
self.quant_method = moe_quant_config.get_quant_method(self)
9495
self.moe_quant_type = moe_quant_config.name()
@@ -142,7 +143,7 @@ def init_moe_weights(self):
142143
if self.moe_quant_type == "fp8":
143144
#(TODO:gaoziyuan)
144145
pass
145-
else:
146+
elif self.moe_quant_type == "wint8":
146147
self.weight_dtype = "int8"
147148
self.init_weight_only_scale()
148149

fastdeploy/model_executor/model_loader.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -91,8 +91,11 @@ def clean_memory_fragments(self, state_dict: dict) -> None:
9191
def load_model(self, fd_config: FDConfig) -> nn.Layer:
9292
context = paddle.LazyGuard()
9393
architectures = fd_config.model_config.architectures[0]
94-
# TODO(gongshaotian): Now, only support safetensor
95-
model_class = MODEL_CLASSES[architectures]
94+
95+
if fd_config.load_config.dynamic_load_weight:
96+
# register rl model
97+
import fastdeploy.rl
98+
architectures = architectures + "RL"
9699

97100
with context:
98101
model_cls = ModelRegistry.get_class(architectures)
@@ -104,6 +107,8 @@ def load_model(self, fd_config: FDConfig) -> nn.Layer:
104107
if fd_config.load_config.dynamic_load_weight:
105108
return model
106109

110+
# TODO(gongshaotian): Now, only support safetensor
111+
model_class = MODEL_CLASSES[architectures]
107112
state_dict = load_composite_checkpoint(
108113
fd_config.parallel_config.model_name_or_path,
109114
model_class,

fastdeploy/model_executor/models/__init__.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,7 @@ def _find_py_files(root_dir):
3636

3737

3838
def auto_models_registry(dir_path,
39-
register_path="fastdeploy.model_executor.models",
40-
suffix=""):
39+
register_path="fastdeploy.model_executor.models"):
4140
"""
4241
auto registry all models in this folder
4342
"""
@@ -49,7 +48,7 @@ def auto_models_registry(dir_path,
4948
if inspect.isclass(attr) and issubclass(
5049
attr,
5150
ModelForCasualLM) and attr is not ModelForCasualLM:
52-
ModelRegistry.register(attr, suffix=suffix)
51+
ModelRegistry.register(attr)
5352
except ImportError:
5453
raise ImportError(f"{module_file=} import error")
5554

fastdeploy/model_executor/models/model_base.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,12 +28,12 @@ class ModelRegistry:
2828
_registry = {}
2929

3030
@classmethod
31-
def register(cls, model_class, suffix=""):
31+
def register(cls, model_class):
3232
"""register model class"""
3333
if issubclass(
3434
model_class,
3535
ModelForCasualLM) and model_class is not ModelForCasualLM:
36-
cls._registry[f"{model_class.name()}{suffix}"] = model_class
36+
cls._registry[model_class.name()] = model_class
3737
return model_class
3838

3939
@classmethod

fastdeploy/model_executor/models/qwen2.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -302,6 +302,7 @@ def __init__(self, fd_config: FDConfig):
302302
"""
303303
super(Qwen2ForCausalLM, self).__init__(fd_config)
304304

305+
self.fd_config =fd_config
305306
self.model = Qwen2Model(fd_config=fd_config)
306307

307308
self.ori_vocab_size = fd_config.model_config.ori_vocab_size

fastdeploy/model_executor/ops/gpu/__init__.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,19 @@
1313
# limitations under the License.
1414
"""fastdeploy gpu ops"""
1515

16-
import os
16+
import sys
17+
1718
from fastdeploy.import_ops import import_custom_ops
1819

1920
PACKAGE = "fastdeploy.model_executor.ops.gpu"
2021

2122
import_custom_ops(PACKAGE, "..base.fastdeploy_base_ops", globals())
2223
import_custom_ops(PACKAGE, ".fastdeploy_ops", globals())
24+
25+
26+
def tolerant_import_error():
27+
class NoneModule:
28+
def __getattr__(self, name):
29+
return None
30+
31+
sys.modules[__name__] = NoneModule()

fastdeploy/rl/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,4 +17,4 @@
1717

1818
from fastdeploy.model_executor.models import auto_models_registry
1919

20-
auto_models_registry(os.path.dirname(__file__), "fastdeploy.rl", suffix="RL")
20+
auto_models_registry(os.path.dirname(__file__), "fastdeploy.rl")

0 commit comments

Comments
 (0)