Skip to content

Commit ed32fcf

Browse files
22dimensions ponix-j
authored and committed
enable online serving quantization (#877)
For online serving, the "ascend" quantization method is not natively among the available choices, so we need to add the "ascend" quantization method to the list of quantization methods; the user can then enable quantization with the "vllm serve --quantization ascend" command. --------- Signed-off-by: 22dimensions <waitingwind@foxmail.com>
1 parent 4512288 commit ed32fcf

File tree

3 files changed

+17
-5
lines changed

3 files changed

+17
-5
lines changed

vllm_ascend/platform.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
from vllm.platforms import Platform, PlatformEnum
2626
from vllm.utils import supports_dynamo
2727

28-
from vllm_ascend.utils import update_aclgraph_sizes
28+
from vllm_ascend.utils import ASCEND_QUATIZATION_METHOD, update_aclgraph_sizes
2929

3030
CUSTOM_OP_ENABLED = False
3131
try:
@@ -60,7 +60,7 @@ class NPUPlatform(Platform):
6060
device_control_env_var: str = "ASCEND_RT_VISIBLE_DEVICES"
6161
dispatch_key: str = "PrivateUse1"
6262

63-
supported_quantization: list[str] = ["ascend"]
63+
supported_quantization: list[str] = [ASCEND_QUATIZATION_METHOD]
6464

6565
def is_sleep_mode_available(self) -> bool:
6666
return True
@@ -73,6 +73,15 @@ def pre_register_and_update(cls,
7373
from vllm_ascend.utils import adapt_patch
7474
adapt_patch(is_global_patch=True)
7575

76+
# For online serving, the "ascend" quantization method is not natively among the choices,
77+
# so we add "ascend" to the list of choices accepted by the "--quantization" option,
78+
# letting the user enable quantization with "vllm serve --quantization ascend".
79+
if parser is not None:
80+
quant_action = parser._option_string_actions.get('--quantization')
81+
if quant_action and hasattr(quant_action, 'choices'):
82+
if ASCEND_QUATIZATION_METHOD not in quant_action.choices:
83+
quant_action.choices.append(ASCEND_QUATIZATION_METHOD)
84+
7685
from vllm_ascend.quantization.quant_config import \
7786
AscendQuantConfig # noqa: F401
7887

vllm_ascend/quantization/quant_config.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,11 +38,12 @@
3838
from vllm.model_executor.utils import set_weight_attrs
3939

4040
from vllm_ascend.ops.fused_moe import AscendUnquantizedFusedMoEMethod
41+
from vllm_ascend.utils import ASCEND_QUATIZATION_METHOD
4142

4243
from .quantizer import AscendQuantizer
4344

4445

45-
@register_quantization_config("ascend")
46+
@register_quantization_config(ASCEND_QUATIZATION_METHOD)
4647
class AscendQuantConfig(QuantizationConfig):
4748
"""Config class for Ascend
4849
@@ -58,7 +59,7 @@ def __repr__(self) -> str:
5859

5960
@classmethod
6061
def get_name(cls) -> str:
61-
return "ascend"
62+
return ASCEND_QUATIZATION_METHOD
6263

6364
@classmethod
6465
def get_supported_act_dtypes(cls) -> List[torch.dtype]:
@@ -81,7 +82,7 @@ def from_config(cls, config: Dict[str, Any]) -> "AscendQuantConfig":
8182
def override_quantization_method(cls, hf_quant_cfg,
8283
user_quant) -> Optional[str]:
8384
if torch.npu.is_available():
84-
return "ascend"
85+
return ASCEND_QUATIZATION_METHOD
8586
return None
8687

8788
def get_quant_method(self, layer: torch.nn.Module,

vllm_ascend/utils.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@
3838
# Maximum number of graphs that can be captured by ACL Graph
3939
MAX_CAPTURE_SIZE = 1920
4040

41+
ASCEND_QUATIZATION_METHOD = "ascend"
42+
4143

4244
def try_register_lib(lib_name: str, lib_info: str = ""):
4345
import importlib

0 commit comments

Comments
 (0)