enable online serving quantization (#877)

22dimensions · ponix-j · commit ed32fcf85dcb · 2025-05-19T12:03:18.000+08:00
For online serving, "ascend" is not natively one of the accepted choices
for the "--quantization" argument, so we add the "ascend" quantization
method to that list of choices. Users can then enable quantization with
the "vllm serve --quantization ascend" command.

---------

Signed-off-by: 22dimensions <waitingwind@foxmail.com>
diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py
@@ -25,7 +25,7 @@
 from vllm.platforms import Platform, PlatformEnum
 from vllm.utils import supports_dynamo
 
-from vllm_ascend.utils import update_aclgraph_sizes
+from vllm_ascend.utils import ASCEND_QUATIZATION_METHOD, update_aclgraph_sizes
 
 CUSTOM_OP_ENABLED = False
 try:
@@ -60,7 +60,7 @@ class NPUPlatform(Platform):
     device_control_env_var: str = "ASCEND_RT_VISIBLE_DEVICES"
     dispatch_key: str = "PrivateUse1"
 
-    supported_quantization: list[str] = ["ascend"]
+    supported_quantization: list[str] = [ASCEND_QUATIZATION_METHOD]
 
     def is_sleep_mode_available(self) -> bool:
         return True
@@ -73,6 +73,15 @@ def pre_register_and_update(cls,
         from vllm_ascend.utils import adapt_patch
         adapt_patch(is_global_patch=True)
 
+        # For online serving, "ascend" quantization method is not a choice natively,
+        # so we need to add "ascend" quantization method to quantization methods list
+        # and the user can enable quantization using "vllm serve --quantization ascend".
+        if parser is not None:
+            quant_action = parser._option_string_actions.get('--quantization')
+            if quant_action and hasattr(quant_action, 'choices'):
+                if ASCEND_QUATIZATION_METHOD not in quant_action.choices:
+                    quant_action.choices.append(ASCEND_QUATIZATION_METHOD)
+
         from vllm_ascend.quantization.quant_config import \
             AscendQuantConfig  # noqa: F401
 
diff --git a/vllm_ascend/quantization/quant_config.py b/vllm_ascend/quantization/quant_config.py
@@ -38,11 +38,12 @@
 from vllm.model_executor.utils import set_weight_attrs
 
 from vllm_ascend.ops.fused_moe import AscendUnquantizedFusedMoEMethod
+from vllm_ascend.utils import ASCEND_QUATIZATION_METHOD
 
 from .quantizer import AscendQuantizer
 
 
-@register_quantization_config("ascend")
+@register_quantization_config(ASCEND_QUATIZATION_METHOD)
 class AscendQuantConfig(QuantizationConfig):
     """Config class for Ascend
     
@@ -58,7 +59,7 @@ def __repr__(self) -> str:
 
     @classmethod
     def get_name(cls) -> str:
-        return "ascend"
+        return ASCEND_QUATIZATION_METHOD
 
     @classmethod
     def get_supported_act_dtypes(cls) -> List[torch.dtype]:
@@ -81,7 +82,7 @@ def from_config(cls, config: Dict[str, Any]) -> "AscendQuantConfig":
     def override_quantization_method(cls, hf_quant_cfg,
                                      user_quant) -> Optional[str]:
         if torch.npu.is_available():
-            return "ascend"
+            return ASCEND_QUATIZATION_METHOD
         return None
 
     def get_quant_method(self, layer: torch.nn.Module,
diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py
@@ -38,6 +38,8 @@
 # Maximum number of graphs that can be captured by ACL Graph
 MAX_CAPTURE_SIZE = 1920
 
+ASCEND_QUATIZATION_METHOD = "ascend"
+
 
 def try_register_lib(lib_name: str, lib_info: str = ""):
     import importlib