27 | 27 | from pydantic.dataclasses import dataclass
28 | 28 | from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE
29 | 29 | from torch.distributed import ProcessGroup, ReduceOp
30 |    | -from transformers import PretrainedConfig
31 | 30 | from typing_extensions import Self, deprecated, runtime_checkable
32 | 31 |
33 | 32 | import vllm.envs as envs
34 | 33 | from vllm import version
35 | 34 | from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
36 | 35 | from vllm.logger import init_logger
37 |    | -from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS,
38 |    | -                                                     QuantizationMethods,
39 |    | -                                                     get_quantization_config)
40 |    | -from vllm.model_executor.models import ModelRegistry
41 | 36 | from vllm.platforms import current_platform
42 |    | -from vllm.tracing import is_otel_available, otel_import_error_traceback
43 | 37 | from vllm.transformers_utils.config import (
44 | 38 |     ConfigFormat, get_config, get_hf_image_processor_config,
45 | 39 |     get_hf_text_config, get_pooling_config,
48 | 42 |     try_get_tokenizer_config, uses_mrope)
49 | 43 | from vllm.transformers_utils.s3_utils import S3Model
50 | 44 | from vllm.transformers_utils.utils import is_s3, maybe_model_redirect
   | 45 | +# yapf conflicts with isort for this block
   | 46 | +# yapf: disable
51 | 47 | from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS,
52 | 48 |                         MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
53 | 49 |                         POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, GiB_bytes,
54 |    | -                        LayerBlockType, common_broadcastable_dtype,
   | 50 | +                        LayerBlockType, LazyLoader, common_broadcastable_dtype,
55 | 51 |                         cuda_device_count_stateless, get_cpu_memory,
56 | 52 |                         get_open_port, is_torch_equal_or_newer, random_uuid,
57 | 53 |                         resolve_obj_by_qualname)
58 | 54 |
   | 55 | +# yapf: enable
   | 56 | +
59 | 57 | if TYPE_CHECKING:
60 | 58 |     from _typeshed import DataclassInstance
61 | 59 |     from ray.util.placement_group import PlacementGroup
   | 60 | +    from transformers.configuration_utils import PretrainedConfig
62 | 61 |
   | 62 | +    import vllm.model_executor.layers.quantization as me_quant
   | 63 | +    import vllm.model_executor.models as me_models
63 | 64 |     from vllm.executor.executor_base import ExecutorBase
   | 65 | +    from vllm.model_executor.layers.quantization import QuantizationMethods
64 | 66 |     from vllm.model_executor.layers.quantization.base_config import (
65 | 67 |         QuantizationConfig)
66 | 68 |     from vllm.model_executor.model_loader import BaseModelLoader
67 | 69 |     from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
68 | 70 |
69 | 71 |     ConfigType = type[DataclassInstance]
   | 72 | +    HfOverrides = Union[dict, Callable[[type], type]]
70 | 73 | else:
71 | 74 |     PlacementGroup = Any
   | 75 | +    PretrainedConfig = Any
72 | 76 |     ExecutorBase = Any
73 | 77 |     QuantizationConfig = Any
   | 78 | +    QuantizationMethods = Any
74 | 79 |     BaseModelLoader = Any
75 | 80 |     TensorizerConfig = Any
76 | 81 |     ConfigType = type
   | 82 | +    HfOverrides = Union[dict[str, Any], Callable[[type], type]]
   | 83 | +
   | 84 | +    me_quant = LazyLoader("model_executor", globals(),
   | 85 | +                          "vllm.model_executor.layers.quantization")
   | 86 | +    me_models = LazyLoader("model_executor", globals(),
   | 87 | +                          "vllm.model_executor.models")
77 | 88 |
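
The two LazyLoader shims in the else branch above are what take the heavy quantization and model-registry modules out of import time. As a minimal sketch of how such a lazy module works, assuming vllm.utils.LazyLoader follows the common importlib/types.ModuleType recipe (DemoLazyLoader and the json example are illustrative only, not vllm's implementation):

    import importlib
    import types


    class DemoLazyLoader(types.ModuleType):
        """Defers importing a module until an attribute is first accessed."""

        def __init__(self, local_name, parent_globals, name):
            self._local_name = local_name
            self._parent_globals = parent_globals
            super().__init__(name)

        def _load(self):
            # Perform the real import, then swap the shim out of the
            # caller's namespace so later lookups bypass __getattr__.
            module = importlib.import_module(self.__name__)
            self._parent_globals[self._local_name] = module
            self.__dict__.update(module.__dict__)
            return module

        def __getattr__(self, item):
            return getattr(self._load(), item)


    # Defining the shim costs nothing...
    json = DemoLazyLoader("json", globals(), "json")
    # ...the real import happens here, on first attribute access.
    print(json.dumps({"lazy": True}))

First attribute access (e.g. me_quant.QUANTIZATION_METHODS or me_models.ModelRegistry below) performs the real import once; every later lookup is an ordinary module attribute read.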
78 | 89 | logger = init_logger(__name__)
79 | 90 |
100 | 111 |     for task in tasks
101 | 112 | }
102 | 113 |
103 |     | -HfOverrides = Union[dict[str, Any], Callable[[PretrainedConfig],
104 |     | -                                             PretrainedConfig]]
105 |     | -
106 | 114 |
107 | 115 | @runtime_checkable
108 | 116 | class SupportsHash(Protocol):
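
The module-level HfOverrides alias removed above forced transformers to load just to spell the annotation; it now lives in the TYPE_CHECKING/else split shown earlier. A minimal sketch of that pattern under standard typing semantics (DemoOverrides and apply_overrides are hypothetical names):

    from typing import TYPE_CHECKING, Any, Callable, Union

    if TYPE_CHECKING:
        # Static checkers see the precise HF type; this import never runs.
        from transformers.configuration_utils import PretrainedConfig
        DemoOverrides = Union[dict, Callable[["PretrainedConfig"],
                                             "PretrainedConfig"]]
    else:
        # At runtime a cheap stand-in keeps the name resolvable without
        # importing transformers.
        PretrainedConfig = Any
        DemoOverrides = Union[dict[str, Any], Callable[[Any], Any]]


    def apply_overrides(overrides: "DemoOverrides") -> str:
        # Accepts either a dict of config overrides or a callable transform.
        return type(overrides).__name__


    print(apply_overrides({"rope_scaling": {"type": "dynamic"}}))  # dict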
@@ -648,7 +656,7 @@ def validate_model_config_after(self: "ModelConfig") -> "ModelConfig":
648 | 656 |
649 | 657 |     @property
650 | 658 |     def registry(self):
651 |     | -        return ModelRegistry
    | 659 | +        return me_models.ModelRegistry
652 | 660 |
653 | 661 |     @property
654 | 662 |     def architectures(self) -> list[str]:
@@ -859,14 +867,15 @@ def _parse_quant_hf_config(self):
859 | 867 |         return quant_cfg
860 | 868 |
861 | 869 |     def _verify_quantization(self) -> None:
862 |     | -        supported_quantization = QUANTIZATION_METHODS
    | 870 | +        supported_quantization = me_quant.QUANTIZATION_METHODS
863 | 871 |         optimized_quantization_methods = [
864 | 872 |             "fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin",
865 | 873 |             "awq_marlin", "fbgemm_fp8", "compressed-tensors", "experts_int8",
866 | 874 |             "quark", "modelopt_fp4", "bitblas", "gptq_bitblas"
867 | 875 |         ]
868 | 876 |         if self.quantization is not None:
869 |     | -            self.quantization = cast(QuantizationMethods, self.quantization)
    | 877 | +            self.quantization = cast(me_quant.QuantizationMethods,
    | 878 | +                                     self.quantization)
870 | 879 |
871 | 880 |         # Parse quantization method from the HF model config, if available.
872 | 881 |         quant_cfg = self._parse_quant_hf_config()
@@ -900,14 +909,14 @@ def _verify_quantization(self) -> None:
900 | 909 |
901 | 910 |             # Detect which checkpoint is it
902 | 911 |             for name in quantization_methods:
903 |     | -                method = get_quantization_config(name)
    | 912 | +                method = me_quant.get_quantization_config(name)
904 | 913 |                 quantization_override = method.override_quantization_method(
905 | 914 |                     quant_cfg, self.quantization)
906 | 915 |                 if quantization_override is not None:
907 | 916 |                     # Raise error if the override is not custom (custom would
908 | 917 |                     # be in QUANTIZATION_METHODS but not QuantizationMethods)
909 | 918 |                     # and hasn't been added to the overrides list.
910 |     | -                    if (name in get_args(QuantizationMethods)
    | 919 | +                    if (name in get_args(me_quant.QuantizationMethods)
911 | 920 |                             and name not in overrides):
912 | 921 |                         raise ValueError(
913 | 922 |                             f"Quantization method {name} is an override but "
@@ -1417,7 +1426,7 @@ def runner_type(self) -> RunnerType:
1417 | 1426 |     @property
1418 | 1427 |     def is_v1_compatible(self) -> bool:
1419 | 1428 |         architectures = getattr(self.hf_config, "architectures", [])
1420 |      | -        return ModelRegistry.is_v1_compatible(architectures)
     | 1429 | +        return me_models.ModelRegistry.is_v1_compatible(architectures)
1421 | 1430 |
1422 | 1431 |     @property
1423 | 1432 |     def is_matryoshka(self) -> bool:
@@ -2376,7 +2385,7 @@ class SpeculativeConfig:
2376 | 2385 |     according to the log probability settings in SamplingParams."""
2377 | 2386 |
2378 | 2387 |     # Draft model configuration
2379 |      | -    quantization: Optional[QuantizationMethods] = None
     | 2388 | +    quantization: Optional[me_quant.QuantizationMethods] = None
2380 | 2389 |     """Quantization method that was used to quantize the draft model weights.
2381 | 2390 |     If `None`, we assume the model weights are not quantized. Note that it only
2382 | 2391 |     takes effect when using the draft model-based speculative method."""
@@ -3624,6 +3633,7 @@ def __post_init__(self):
3624 | 3633 |                 and "," in self.collect_detailed_traces[0]):
3625 | 3634 |             self._parse_collect_detailed_traces()
3626 | 3635 |
     | 3636 | +        from vllm.tracing import is_otel_available, otel_import_error_traceback
3627 | 3637 |         if not is_otel_available() and self.otlp_traces_endpoint is not None:
3628 | 3638 |             raise ValueError(
3629 | 3639 |                 "OpenTelemetry is not available. Unable to configure "
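
Moving the vllm.tracing import into __post_init__ above means plain importers of vllm.config never pay for the OpenTelemetry stack. A hedged sketch of the same function-local-import pattern (configure_tracing is a hypothetical helper; json stands in for a heavy optional dependency):

    def configure_tracing(endpoint):
        # Deferred import: executed on the first call, cached in
        # sys.modules afterwards, and skipped entirely if tracing is
        # never configured.
        import json  # stand-in for a heavy or optional dependency
        if endpoint is not None:
            return json.dumps({"otlp_traces_endpoint": endpoint})
        return None


    # Module import stayed cheap; the cost lands here, on first use.
    print(configure_tracing("http://localhost:4317"))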