Commit cf394db ("fix")
1 parent: 8d72ccc

File tree: 3 files changed (+12, -10 lines)

fastdeploy/config.py
Lines changed: 2 additions & 2 deletions

```diff
@@ -187,7 +187,7 @@ def __init__(
         else:
             raise NotImplementedError
         # enable the custom all-reduce kernel and fall back to NCCL(dist.all_reduce).
-        enable_custom_all_reduce: str = "store_true"
+        self.enable_custom_all_reduce: bool = False

 @dataclass
 class SpeculativeConfig:
@@ -225,7 +225,7 @@ def __init__(
         # During benchmarking, we need to enforce that the number of accepted tokens is 1.
         # This means no tokens from MTP are accepted.
         # This ensures that the specified simulation acceptance rate is not affected.
-        benchmark_mode: bool = False
+        self.benchmark_mode: bool = False

         for key, value in args.items():
             if hasattr(self, key):
```
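Both config.py hunks fix the same pattern: inside `__init__`, a bare annotated assignment such as `enable_custom_all_reduce: str = "store_true"` binds only a local variable, so the instance never gets the attribute, and the override loop visible above (`if hasattr(self, key):`) silently skips it. The old default `"store_true"` was also an argparse action string rather than a boolean. A minimal sketch of the bug, using an illustrative class rather than the real FastDeploy config (the loop body is assumed, since the diff shows only its first two lines):

```python
# Illustrative stand-in for the FastDeploy config classes; not the real code.
class ConfigSketch:
    def __init__(self, args: dict):
        # BUG (old form): an annotated assignment inside __init__ binds a
        # *local* variable; the instance gets no such attribute.
        enable_custom_all_reduce: bool = False  # local, discarded

        # FIX (new form): assign through self so the attribute exists.
        self.benchmark_mode: bool = False

        # Override loop as in SpeculativeConfig: only keys that already
        # exist on self are copied, so the buggy form is silently skipped.
        for key, value in args.items():
            if hasattr(self, key):
                setattr(self, key, value)


cfg = ConfigSketch({"enable_custom_all_reduce": True, "benchmark_mode": True})
print(hasattr(cfg, "enable_custom_all_reduce"))  # False: override was dropped
print(cfg.benchmark_mode)                        # True: override applied
```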

fastdeploy/worker/vl_gpu_model_runner.py
Lines changed: 7 additions & 5 deletions

```diff
@@ -25,9 +25,10 @@
 from paddleformers.transformers.model_utils import load_tp_checkpoint
 from safetensors import safe_open

-from fastdeploy.config import (DecodingConfig, DeviceConfig, FDConfig, GraphOptimizationConfig,
-                               LoadConfig, ModelConfig, MoEPhase,
-                               ParallelConfig, SpeculativeConfig)
+from fastdeploy.config import (DecodingConfig, DeviceConfig, FDConfig,
+                               GraphOptimizationConfig, LoadConfig,
+                               ModelConfig, MoEPhase, ParallelConfig,
+                               SpeculativeConfig)
 from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer
 from fastdeploy.input.mm_processor import DataProcessor
 from fastdeploy.model_executor.layers.attention import get_attention_backend
@@ -266,8 +267,9 @@ def _load_model(
         self.image_preprocess = image_preprocess

         graph_opt_config = GraphOptimizationConfig(
-            self.args.enable_static_graph_inference, self.args.use_cudagraph,
-            self.args.max_capture_batch_size)
+            self.args.enable_static_graph_inference,
+            self.args.max_capture_batch_size,
+            vars(self.args))

         fd_config, self.model = build_stream_line_model(
             self.args.model_name_or_path,
```
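The second hunk drops `self.args.use_cudagraph` from the middle of the positional argument list and appends `vars(self.args)`. With positional construction, every argument after a removed parameter silently shifts to a different slot, which is exactly the kind of change this commit makes. A self-contained sketch of that hazard, using a stand-in dataclass since the real `GraphOptimizationConfig` signature is not shown in this diff:

```python
from dataclasses import dataclass
from typing import Any, Dict, Optional


# Stand-in for GraphOptimizationConfig; field names mirror the call site
# in the diff, but the third field is hypothetical.
@dataclass
class GraphOptSketch:
    enable_static_graph_inference: bool
    max_capture_batch_size: int
    full_args: Optional[Dict[str, Any]] = None  # hypothetical third field


# Positional style (as in the diff): correctness depends on the argument
# order matching the current signature exactly.
cfg = GraphOptSketch(False, 64, {"use_cudagraph": False})

# Keyword style: a removed or reordered parameter raises a TypeError
# instead of silently binding values to the wrong fields.
cfg = GraphOptSketch(
    enable_static_graph_inference=False,
    max_capture_batch_size=64,
    full_args={"use_cudagraph": False},
)
```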

fastdeploy/worker/worker_process.py
Lines changed: 3 additions & 3 deletions

```diff
@@ -615,9 +615,9 @@ def initialize_fd_config(args: argparse.Namespace) -> FDConfig:

     if quantization_config is not None:
         quant_config_name = quantization_config["quantization"]
-    elif getattr(config_or_args, 'quantization', None) != "None":
+    elif args.quantization != "None":
         quantization_config = {}
-        quant_config_name = getattr(config_or_args, 'quantization', None)
+        quant_config_name = args.quantization
         quantization_config["quantization"] = quant_config_name
     # Special handling for Ernie models
     is_ernie = "Ernie4_5_ForCausalLM" in model_config.architectures or \
@@ -647,7 +647,7 @@ def initialize_fd_config(args: argparse.Namespace) -> FDConfig:
         logger.info(
             "Model Status: Original (will apply online quantization)")

-        logger.info(f"Quantization Method: {getattr(config_or_args, 'quantization', 'None')}")
+        logger.info(f"{quantization_config}")
     else:
         logger.info(
             "No quantization config found and use original weight and act dtype."
```
