|
13 | 13 | # See the License for the specific language governing permissions and
|
14 | 14 | # limitations under the License.
|
15 | 15 | """
|
| 16 | +import argparse |
16 | 17 | import json
|
17 | 18 | import os
|
18 | 19 | import random
|
19 |
| -import argparse |
| 20 | +from typing import Optional |
20 | 21 |
|
21 | 22 | import numpy as np
|
22 | 23 | import paddle
|
23 | 24 | import paddle.distributed.fleet as fleet
|
24 | 25 | from paddleformers.transformers.model_utils import load_tp_checkpoint
|
25 | 26 | from safetensors import safe_open
|
26 | 27 |
|
| 28 | +from fastdeploy.config import (DeviceConfig, FDConfig, GraphOptimizationConfig, |
| 29 | + KVCacheConfig, LoadConfig, ModelConfig, |
| 30 | + MoEConfig, MoEPhase, ParallelConfig, |
| 31 | + SpeculativeConfig) |
27 | 32 | from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer
|
28 | 33 | from fastdeploy.input.mm_processor import DataProcessor
|
29 | 34 | from fastdeploy.model_executor.layers.attention import get_attention_backend
|
|
44 | 49 | from fastdeploy.worker.forward_meta import ForwardMeta
|
45 | 50 | from fastdeploy.worker.utils import check_safetensors_model
|
46 | 51 | from fastdeploy.worker.vl_model_runner_base import VLModelRunnerBase
|
47 |
| -from fastdeploy.config import (DeviceConfig, FDConfig, KVCacheConfig, |
48 |
| - LoadConfig, ModelConfig, MoEConfig, |
49 |
| - MoEPhase, ParallelConfig, SpeculativeConfig) |
50 | 52 |
|
51 | 53 | if current_platform.is_cuda() and current_platform.available():
|
52 | 54 | from fastdeploy.model_executor.layers.utils import (
|
@@ -268,13 +270,18 @@ def _load_model(
|
268 | 270 | -1)
|
269 | 271 | self.image_preprocess = image_preprocess
|
270 | 272 |
|
| 273 | + graph_opt_config = GraphOptimizationConfig( |
| 274 | + self.args.enable_static_graph_inference, self.args.use_cudagraph, |
| 275 | + self.args.max_capture_batch_size) |
| 276 | + |
271 | 277 | fd_config, self.model = build_stream_line_model(
|
272 | 278 | self.args.model_name_or_path,
|
273 | 279 | self.args.dtype,
|
274 | 280 | self.args.block_size,
|
275 | 281 | max_model_len=self.args.max_model_len,
|
276 | 282 | tokenizer=tokenizer,
|
277 | 283 | quantization=self.args.quantization,
|
| 284 | + graph_opt_config=graph_opt_config, |
278 | 285 | )
|
279 | 286 | self.model.eval()
|
280 | 287 | self.set_state_dict(self.args)
|
@@ -1050,6 +1057,7 @@ def build_stream_line_model(
|
1050 | 1057 | max_model_len: int,
|
1051 | 1058 | tokenizer: ErnieBotTokenizer,
|
1052 | 1059 | quantization: str = "None",
|
| 1060 | + graph_opt_config: Optional[GraphOptimizationConfig] = None |
1053 | 1061 | ) -> tuple[FDConfig, paddle.nn.layer]:
|
1054 | 1062 | """
|
1055 | 1063 | build model
|
@@ -1221,6 +1229,7 @@ def build_stream_line_model(
|
1221 | 1229 | moe_config=moe_config,
|
1222 | 1230 | quant_config=quant_config,
|
1223 | 1231 | kv_cache_config=kv_cache_config,
|
| 1232 | + graph_opt_config=graph_opt_config, |
1224 | 1233 | )
|
1225 | 1234 | fd_config.parallel_config.max_model_len = max_model_len
|
1226 | 1235 | fd_config.model_config.rope_theta = rope_theta
|
|
0 commit comments