@@ -27,19 +27,13 @@
 from pydantic.dataclasses import dataclass
 from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE
 from torch.distributed import ProcessGroup, ReduceOp
-from transformers import PretrainedConfig
 from typing_extensions import Self, deprecated, runtime_checkable

 import vllm.envs as envs
 from vllm import version
 from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
 from vllm.logger import init_logger
-from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS,
-                                                     QuantizationMethods,
-                                                     get_quantization_config)
-from vllm.model_executor.models import ModelRegistry
 from vllm.platforms import current_platform
-from vllm.tracing import is_otel_available, otel_import_error_traceback
 from vllm.transformers_utils.config import (
     ConfigFormat, get_config, get_hf_image_processor_config,
     get_hf_text_config, get_pooling_config,
@@ -48,32 +42,49 @@
     try_get_tokenizer_config, uses_mrope)
 from vllm.transformers_utils.s3_utils import S3Model
 from vllm.transformers_utils.utils import is_s3, maybe_model_redirect
+# yapf conflicts with isort for this block
+# yapf: disable
 from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS,
                         MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
                         POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, GiB_bytes,
-                        LayerBlockType, common_broadcastable_dtype,
+                        LayerBlockType, LazyLoader, common_broadcastable_dtype,
                         cuda_device_count_stateless, get_cpu_memory,
                         get_open_port, is_torch_equal_or_newer, random_uuid,
                         resolve_obj_by_qualname)

+# yapf: enable
+
 if TYPE_CHECKING:
     from _typeshed import DataclassInstance
     from ray.util.placement_group import PlacementGroup
+    from transformers.configuration_utils import PretrainedConfig

+    import vllm.model_executor.layers.quantization as me_quant
+    import vllm.model_executor.models as me_models
     from vllm.executor.executor_base import ExecutorBase
+    from vllm.model_executor.layers.quantization import QuantizationMethods
     from vllm.model_executor.layers.quantization.base_config import (
         QuantizationConfig)
     from vllm.model_executor.model_loader import BaseModelLoader
     from vllm.model_executor.model_loader.tensorizer import TensorizerConfig

     ConfigType = type[DataclassInstance]
+    HfOverrides = Union[dict, Callable[[type], type]]
 else:
     PlacementGroup = Any
+    PretrainedConfig = Any
     ExecutorBase = Any
     QuantizationConfig = Any
+    QuantizationMethods = Any
     BaseModelLoader = Any
     TensorizerConfig = Any
     ConfigType = type
+    HfOverrides = Union[dict[str, Any], Callable[[type], type]]
+
+    me_quant = LazyLoader("model_executor", globals(),
+                          "vllm.model_executor.layers.quantization")
+    me_models = LazyLoader("model_executor", globals(),
+                           "vllm.model_executor.models")

 logger = init_logger(__name__)

@@ -100,9 +111,6 @@
     for task in tasks
 }

-HfOverrides = Union[dict[str, Any], Callable[[PretrainedConfig],
-                                             PretrainedConfig]]
-

 @runtime_checkable
 class SupportsHash(Protocol):
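
[Note] The hunks above replace eager module-level imports with names that resolve only on first use: type checkers still see the real imports inside `if TYPE_CHECKING:`, while at runtime `me_quant` and `me_models` are `LazyLoader` shims from `vllm.utils`. That class is not shown in this diff; the standalone sketch below is modeled on the same pattern and assumes only the `(local_name, globals(), import_path)` calling convention visible above.

import importlib
import types


class LazyLoader(types.ModuleType):
    """Module shim that defers the real import until first attribute access."""

    def __init__(self, local_name, parent_module_globals, name):
        self._local_name = local_name
        self._parent_module_globals = parent_module_globals
        super().__init__(name)

    def _load(self):
        # Import the target for real, then rebind the caller's global name so
        # later lookups bypass this shim entirely.
        module = importlib.import_module(self.__name__)
        self._parent_module_globals[self._local_name] = module
        self.__dict__.update(module.__dict__)
        return module

    def __getattr__(self, item):
        # Only reached while the real module has not been loaded yet.
        return getattr(self._load(), item)


# Usage mirroring the diff, with stdlib 'json' standing in for the heavy module:
json_mod = LazyLoader("json_mod", globals(), "json")
print(json_mod.dumps({"deferred": True}))  # the real import happens here

Because `_load` also copies the real module's `__dict__` into the shim, references to the shim captured before the first access keep working afterwards.
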
@@ -648,7 +656,7 @@ def validate_model_config_after(self: "ModelConfig") -> "ModelConfig":

     @property
     def registry(self):
-        return ModelRegistry
+        return me_models.ModelRegistry

     @property
     def architectures(self) -> list[str]:
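
[Note] `registry` now goes through the `me_models` shim, so the models package is imported the first time the property is read rather than when `vllm.config` is imported. For comparison only (not part of this change), the standard library achieves the same deferral with `importlib.util.LazyLoader`, following the recipe in the `importlib` documentation:

import importlib.util
import sys


def lazy_import(name):
    # Wrap the module's loader so its body executes on first attribute access.
    spec = importlib.util.find_spec(name)
    loader = importlib.util.LazyLoader(spec.loader)
    spec.loader = loader
    module = importlib.util.module_from_spec(spec)
    sys.modules[name] = module
    loader.exec_module(module)
    return module


lazy_json = lazy_import("json")    # nothing from 'json' has executed yet
print(lazy_json.dumps([1, 2, 3]))  # first access triggers execution
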
@@ -861,14 +869,15 @@ def _parse_quant_hf_config(self):
         return quant_cfg

     def _verify_quantization(self) -> None:
-        supported_quantization = QUANTIZATION_METHODS
+        supported_quantization = me_quant.QUANTIZATION_METHODS
         optimized_quantization_methods = [
             "fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin",
             "awq_marlin", "fbgemm_fp8", "compressed-tensors", "experts_int8",
             "quark", "modelopt_fp4", "bitblas", "gptq_bitblas"
         ]
         if self.quantization is not None:
-            self.quantization = cast(QuantizationMethods, self.quantization)
+            self.quantization = cast(me_quant.QuantizationMethods,
+                                     self.quantization)

             # Parse quantization method from the HF model config, if available.
             quant_cfg = self._parse_quant_hf_config()
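
[Note] One subtlety in the rewritten `cast(...)` call: `typing.cast` returns its second argument unchanged at runtime, so the only runtime effect of the new spelling is the attribute lookup on `me_quant`, which is also what triggers the shim's deferred import. A stdlib-only illustration:

from typing import cast

value: object = "awq"
method = cast(str, value)  # no conversion and no check; the same object back
assert method is value
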
@@ -902,14 +911,14 @@ def _verify_quantization(self) -> None:

                 # Detect which checkpoint is it
                 for name in quantization_methods:
-                    method = get_quantization_config(name)
+                    method = me_quant.get_quantization_config(name)
                     quantization_override = method.override_quantization_method(
                         quant_cfg, self.quantization)
                     if quantization_override is not None:
                         # Raise error if the override is not custom (custom would
                         # be in QUANTIZATION_METHODS but not QuantizationMethods)
                         # and hasn't been added to the overrides list.
-                        if (name in get_args(QuantizationMethods)
+                        if (name in get_args(me_quant.QuantizationMethods)
                                 and name not in overrides):
                             raise ValueError(
                                 f"Quantization method {name} is an override but "
@@ -1419,7 +1428,7 @@ def runner_type(self) -> RunnerType:
     @property
     def is_v1_compatible(self) -> bool:
         architectures = getattr(self.hf_config, "architectures", [])
-        return ModelRegistry.is_v1_compatible(architectures)
+        return me_models.ModelRegistry.is_v1_compatible(architectures)

     @property
     def is_matryoshka(self) -> bool:
@@ -2378,7 +2387,7 @@ class SpeculativeConfig:
     according to the log probability settings in SamplingParams."""

     # Draft model configuration
-    quantization: Optional[QuantizationMethods] = None
+    quantization: Optional[me_quant.QuantizationMethods] = None
     """Quantization method that was used to quantize the draft model weights.
     If `None`, we assume the model weights are not quantized. Note that it only
     takes effect when using the draft model-based speculative method."""
@@ -3631,6 +3640,7 @@ def __post_init__(self):
                 and "," in self.collect_detailed_traces[0]):
             self._parse_collect_detailed_traces()

+        from vllm.tracing import is_otel_available, otel_import_error_traceback
         if not is_otel_available() and self.otlp_traces_endpoint is not None:
             raise ValueError(
                 "OpenTelemetry is not available. Unable to configure "
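
[Note] The final hunk applies the same deferral without a shim: the tracing import moves into `__post_init__`, so `vllm.tracing` (and its OpenTelemetry dependency) is loaded only when this config object is actually constructed. A minimal sketch of the function-local import pattern, with stdlib `json` standing in for the deferred dependency:

def build_report(payload):
    # Deferred dependency: 'json' is imported on the first call, not when the
    # enclosing module is imported (mirrors the vllm.tracing move above).
    import json
    return json.dumps(payload)


print(build_report({"ok": True}))

Since Python caches imports in `sys.modules`, repeated calls pay only a dictionary lookup after the first one.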