Skip to content

Commit 920cc41

Browse files
davidxiawwl2755-google
authored and committed
[Frontend] speed up import time of vllm.config (vllm-project#18036)
Signed-off-by: David Xia <david@davidxia.com>
1 parent 4becd1f commit 920cc41

File tree

1 file changed

+27
-17
lines changed

1 file changed

+27
-17
lines changed

vllm/config.py

Lines changed: 27 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -27,19 +27,13 @@
2727
from pydantic.dataclasses import dataclass
2828
from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE
2929
from torch.distributed import ProcessGroup, ReduceOp
30-
from transformers import PretrainedConfig
3130
from typing_extensions import Self, deprecated, runtime_checkable
3231

3332
import vllm.envs as envs
3433
from vllm import version
3534
from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
3635
from vllm.logger import init_logger
37-
from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS,
38-
QuantizationMethods,
39-
get_quantization_config)
40-
from vllm.model_executor.models import ModelRegistry
4136
from vllm.platforms import current_platform
42-
from vllm.tracing import is_otel_available, otel_import_error_traceback
4337
from vllm.transformers_utils.config import (
4438
ConfigFormat, get_config, get_hf_image_processor_config,
4539
get_hf_text_config, get_pooling_config,
@@ -48,32 +42,49 @@
4842
try_get_tokenizer_config, uses_mrope)
4943
from vllm.transformers_utils.s3_utils import S3Model
5044
from vllm.transformers_utils.utils import is_s3, maybe_model_redirect
45+
# yapf conflicts with isort for this block
46+
# yapf: disable
5147
from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS,
5248
MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
5349
POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, GiB_bytes,
54-
LayerBlockType, common_broadcastable_dtype,
50+
LayerBlockType, LazyLoader, common_broadcastable_dtype,
5551
cuda_device_count_stateless, get_cpu_memory,
5652
get_open_port, is_torch_equal_or_newer, random_uuid,
5753
resolve_obj_by_qualname)
5854

55+
# yapf: enable
56+
5957
if TYPE_CHECKING:
6058
from _typeshed import DataclassInstance
6159
from ray.util.placement_group import PlacementGroup
60+
from transformers.configuration_utils import PretrainedConfig
6261

62+
import vllm.model_executor.layers.quantization as me_quant
63+
import vllm.model_executor.models as me_models
6364
from vllm.executor.executor_base import ExecutorBase
65+
from vllm.model_executor.layers.quantization import QuantizationMethods
6466
from vllm.model_executor.layers.quantization.base_config import (
6567
QuantizationConfig)
6668
from vllm.model_executor.model_loader import BaseModelLoader
6769
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
6870

6971
ConfigType = type[DataclassInstance]
72+
HfOverrides = Union[dict, Callable[[type], type]]
7073
else:
7174
PlacementGroup = Any
75+
PretrainedConfig = Any
7276
ExecutorBase = Any
7377
QuantizationConfig = Any
78+
QuantizationMethods = Any
7479
BaseModelLoader = Any
7580
TensorizerConfig = Any
7681
ConfigType = type
82+
HfOverrides = Union[dict[str, Any], Callable[[type], type]]
83+
84+
me_quant = LazyLoader("model_executor", globals(),
85+
"vllm.model_executor.layers.quantization")
86+
me_models = LazyLoader("model_executor", globals(),
87+
"vllm.model_executor.models")
7788

7889
logger = init_logger(__name__)
7990

@@ -100,9 +111,6 @@
100111
for task in tasks
101112
}
102113

103-
HfOverrides = Union[dict[str, Any], Callable[[PretrainedConfig],
104-
PretrainedConfig]]
105-
106114

107115
@runtime_checkable
108116
class SupportsHash(Protocol):
@@ -648,7 +656,7 @@ def validate_model_config_after(self: "ModelConfig") -> "ModelConfig":
648656

649657
@property
650658
def registry(self):
651-
return ModelRegistry
659+
return me_models.ModelRegistry
652660

653661
@property
654662
def architectures(self) -> list[str]:
@@ -859,14 +867,15 @@ def _parse_quant_hf_config(self):
859867
return quant_cfg
860868

861869
def _verify_quantization(self) -> None:
862-
supported_quantization = QUANTIZATION_METHODS
870+
supported_quantization = me_quant.QUANTIZATION_METHODS
863871
optimized_quantization_methods = [
864872
"fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin",
865873
"awq_marlin", "fbgemm_fp8", "compressed-tensors", "experts_int8",
866874
"quark", "modelopt_fp4", "bitblas", "gptq_bitblas"
867875
]
868876
if self.quantization is not None:
869-
self.quantization = cast(QuantizationMethods, self.quantization)
877+
self.quantization = cast(me_quant.QuantizationMethods,
878+
self.quantization)
870879

871880
# Parse quantization method from the HF model config, if available.
872881
quant_cfg = self._parse_quant_hf_config()
@@ -900,14 +909,14 @@ def _verify_quantization(self) -> None:
900909

901910
# Detect which checkpoint is it
902911
for name in quantization_methods:
903-
method = get_quantization_config(name)
912+
method = me_quant.get_quantization_config(name)
904913
quantization_override = method.override_quantization_method(
905914
quant_cfg, self.quantization)
906915
if quantization_override is not None:
907916
# Raise error if the override is not custom (custom would
908917
# be in QUANTIZATION_METHODS but not QuantizationMethods)
909918
# and hasn't been added to the overrides list.
910-
if (name in get_args(QuantizationMethods)
919+
if (name in get_args(me_quant.QuantizationMethods)
911920
and name not in overrides):
912921
raise ValueError(
913922
f"Quantization method {name} is an override but "
@@ -1417,7 +1426,7 @@ def runner_type(self) -> RunnerType:
14171426
@property
14181427
def is_v1_compatible(self) -> bool:
14191428
architectures = getattr(self.hf_config, "architectures", [])
1420-
return ModelRegistry.is_v1_compatible(architectures)
1429+
return me_models.ModelRegistry.is_v1_compatible(architectures)
14211430

14221431
@property
14231432
def is_matryoshka(self) -> bool:
@@ -2376,7 +2385,7 @@ class SpeculativeConfig:
23762385
according to the log probability settings in SamplingParams."""
23772386

23782387
# Draft model configuration
2379-
quantization: Optional[QuantizationMethods] = None
2388+
quantization: Optional[me_quant.QuantizationMethods] = None
23802389
"""Quantization method that was used to quantize the draft model weights.
23812390
If `None`, we assume the model weights are not quantized. Note that it only
23822391
takes effect when using the draft model-based speculative method."""
@@ -3624,6 +3633,7 @@ def __post_init__(self):
36243633
and "," in self.collect_detailed_traces[0]):
36253634
self._parse_collect_detailed_traces()
36263635

3636+
from vllm.tracing import is_otel_available, otel_import_error_traceback
36273637
if not is_otel_available() and self.otlp_traces_endpoint is not None:
36283638
raise ValueError(
36293639
"OpenTelemetry is not available. Unable to configure "

0 commit comments

Comments
 (0)