Skip to content

Commit c4718fd

Browse files
authored
Enable SOT D2St in Multimodal Model (#2735)
1 parent f7cad30 commit c4718fd

File tree

2 files changed

+19
-6
lines changed

2 files changed

+19
-6
lines changed

fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@
2727
from fastdeploy.config import FDConfig
2828
from fastdeploy.distributed.communication_op import \
2929
tensor_model_parallel_all_reduce
30+
from fastdeploy.model_executor.graph_optimization.decorator import \
31+
support_graph_optimization
3032
from fastdeploy.model_executor.layers.embeddings import VocabParallelEmbedding
3133
from fastdeploy.model_executor.layers.lm_head import ParallelLMHead
3234
from fastdeploy.model_executor.layers.moe.moe import FusedMoE
@@ -318,6 +320,7 @@ def forward(
318320
return hidden_states, residual
319321

320322

323+
@support_graph_optimization
321324
class Ernie4_5_VLModel(nn.Layer):
322325

323326
def __init__(
@@ -512,7 +515,8 @@ def forward(
512515
image_features: paddle.Tensor,
513516
forward_meta: ForwardMeta,
514517
):
515-
hidden_states = self.model(ids_remove_padding, image_features,
516-
forward_meta)
518+
hidden_states = self.model(ids_remove_padding=ids_remove_padding,
519+
image_features=image_features,
520+
forward_meta=forward_meta)
517521

518522
return hidden_states

fastdeploy/worker/vl_gpu_model_runner.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,17 +13,22 @@
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
1515
"""
16+
import argparse
1617
import json
1718
import os
1819
import random
19-
import argparse
20+
from typing import Optional
2021

2122
import numpy as np
2223
import paddle
2324
import paddle.distributed.fleet as fleet
2425
from paddleformers.transformers.model_utils import load_tp_checkpoint
2526
from safetensors import safe_open
2627

28+
from fastdeploy.config import (DeviceConfig, FDConfig, GraphOptimizationConfig,
29+
KVCacheConfig, LoadConfig, ModelConfig,
30+
MoEConfig, MoEPhase, ParallelConfig,
31+
SpeculativeConfig)
2732
from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer
2833
from fastdeploy.input.mm_processor import DataProcessor
2934
from fastdeploy.model_executor.layers.attention import get_attention_backend
@@ -44,9 +49,6 @@
4449
from fastdeploy.worker.forward_meta import ForwardMeta
4550
from fastdeploy.worker.utils import check_safetensors_model
4651
from fastdeploy.worker.vl_model_runner_base import VLModelRunnerBase
47-
from fastdeploy.config import (DeviceConfig, FDConfig, KVCacheConfig,
48-
LoadConfig, ModelConfig, MoEConfig,
49-
MoEPhase, ParallelConfig, SpeculativeConfig)
5052

5153
if current_platform.is_cuda() and current_platform.available():
5254
from fastdeploy.model_executor.layers.utils import (
@@ -268,13 +270,18 @@ def _load_model(
268270
-1)
269271
self.image_preprocess = image_preprocess
270272

273+
graph_opt_config = GraphOptimizationConfig(
274+
self.args.enable_static_graph_inference, self.args.use_cudagraph,
275+
self.args.max_capture_batch_size)
276+
271277
fd_config, self.model = build_stream_line_model(
272278
self.args.model_name_or_path,
273279
self.args.dtype,
274280
self.args.block_size,
275281
max_model_len=self.args.max_model_len,
276282
tokenizer=tokenizer,
277283
quantization=self.args.quantization,
284+
graph_opt_config=graph_opt_config,
278285
)
279286
self.model.eval()
280287
self.set_state_dict(self.args)
@@ -1050,6 +1057,7 @@ def build_stream_line_model(
10501057
max_model_len: int,
10511058
tokenizer: ErnieBotTokenizer,
10521059
quantization: str = "None",
1060+
graph_opt_config: Optional[GraphOptimizationConfig] = None
10531061
) -> tuple[FDConfig, paddle.nn.layer]:
10541062
"""
10551063
build model
@@ -1221,6 +1229,7 @@ def build_stream_line_model(
12211229
moe_config=moe_config,
12221230
quant_config=quant_config,
12231231
kv_cache_config=kv_cache_config,
1232+
graph_opt_config=graph_opt_config,
12241233
)
12251234
fd_config.parallel_config.max_model_len = max_model_len
12261235
fd_config.model_config.rope_theta = rope_theta

0 commit comments

Comments (0)