
Commit 12cae04

Yikun, dingdingchaomian, Angazenn, liujiaxu, and ApsarasX authored
[quantization] Support w8a8 quantization (#580)
### What this PR does / why we need it?

Add a `VLLMAscendQuantizer` to support w8a8 static (W8A8) and dynamic (W8A8_DYNAMIC) quantization on linear and MoE layers. The quantizer is enabled if a model has a [quantize field](https://huggingface.co/vllm-ascend/Qwen2.5-0.5B-Instruct-w8a8/blob/main/config.json#L27) in its config. If MindIE Turbo is installed, the MindIE Turbo quantizer is applied; otherwise the `VLLMAscendQuantizer` is used directly.

- This patch fixes the installation docs so that installation works
- This patch enables norm quantization by patching `RMSNorm.__init__`, `RMSNorm.forward_oot`, and `NPUModelRunnerBase.load_model`
- Add `AscendW8A8LinearMethod` for W8A8
- Add `AscendW8A8DynamicLinearMethod` and `AscendW8A8DynamicFusedMoEMethod` for W8A8_DYNAMIC
- Add an e2e test for `vllm-ascend/Qwen2.5-0.5B-Instruct-w8a8`

### Does this PR introduce _any_ user-facing change?

Yes, w8a8 quantization is now supported. With this patch, users can run w8a8 models with a command such as:

```
vllm serve /root/.cache/modelscope/hub/Qwen/Qwen2.5-7B-Instruct-w8a8 --served-model-name "qwen2.5-7B"
```

### How was this patch tested?

0. CI passed: an e2e test was added for `vllm-ascend/Qwen2.5-0.5B-Instruct-w8a8`.
1. From @Yikun: functional testing of Qwen2.5-0.5B-Instruct-w8a8 went well; please refer to #580 (comment).
2. From @dingdingchaomian: tested with the qwen2.5-72b-instruct and deepseek-v2-lite-chat models, both quantized using Ascend's msmodelslim tool:
   - Qwen2.5-72b-instruct was tested twice, once with w8a8 static and once with w8a8 dynamic.
   - Deepseek-v2-lite-chat was tested once because its quantization uses both static and dynamic w8a8.

   Both models were tested with offline inference and online serving, and both work well. The inference code is exactly the same as the examples in https://vllm-ascend.readthedocs.io/en/latest/quick_start.html, with only the model path and tensor parallel size changed.

---------

Signed-off-by: dingdingchaomian <wangce21@huawei.com>
Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
Co-authored-by: dingdingchaomian <wangce21@huawei.com>
Co-authored-by: Angazenn <zengyanjia@huawei.com>
Co-authored-by: liujiaxu <liujiaxu4@huawei.com>
Co-authored-by: ApsarasX <apsarax@outlook.com>
Co-authored-by: ganyi1996ppo <pleaplusone.gy@gmail.com>
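For reference, below is a minimal offline-inference sketch in the spirit of the quick start example referenced above; the model path points at the w8a8 e2e-test checkpoint, while the prompt and sampling settings are illustrative rather than taken from the PR:

```python
# Minimal offline-inference sketch for a w8a8 checkpoint (mirrors the quick
# start example; prompt and sampling settings are illustrative).
import os

# Pull the model from ModelScope, as the updated e2e test does.
os.environ["VLLM_USE_MODELSCOPE"] = "True"

from vllm import LLM, SamplingParams

prompts = ["Hello, my name is"]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=32)

# Quantization is picked up from the "quantize" field in the model's config.json.
llm = LLM(model="vllm-ascend/Qwen2.5-0.5B-Instruct-w8a8")

outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    print(output.outputs[0].text)
```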
1 parent 1a1f9a6 commit 12cae04

File tree

7 files changed, +843 -16 lines changed


docs/source/installation.md

Lines changed: 8 additions & 1 deletion
````diff
@@ -61,6 +61,7 @@ docker run --rm \
 -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
 -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
 -v /etc/ascend_install.info:/etc/ascend_install.info \
+-v /root/.cache:/root/.cache \
 -it $IMAGE bash
 ```
 
@@ -123,7 +124,7 @@ First install system dependencies:
 
 ```bash
 apt update -y
-apt install -y gcc g++ cmake libnuma-dev
+apt install -y gcc g++ cmake libnuma-dev wget
 ```
 
 Current version depends on a unreleased `torch-npu`, you need to install manually:
@@ -144,6 +145,7 @@ cd pta
 wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250320.3/pytorch_v2.5.1_py310.tar.gz
 tar -xvf pytorch_v2.5.1_py310.tar.gz
 pip install ./torch_npu-2.5.1.dev20250320-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
+cd ..
 ```
 
 Then you can install `vllm` and `vllm-ascend` from **pre-built wheel**:
@@ -152,6 +154,8 @@ Then you can install `vllm` and `vllm-ascend` from **pre-built wheel**:
 :substitutions:
 
 # Install vllm-project/vllm from pypi
+# There was a vLLM v0.8.4 installation bug, please use "Build from source code"
+# https://github.com/vllm-project/vllm-ascend/issues/581
 pip install vllm==|pip_vllm_version|
 
 # Install vllm-project/vllm-ascend from pypi.
@@ -168,11 +172,13 @@ or build from **source code**:
 git clone --depth 1 --branch |vllm_version| https://github.com/vllm-project/vllm
 cd vllm
 VLLM_TARGET_DEVICE=empty pip install . --extra-index https://download.pytorch.org/whl/cpu/
+cd ..
 
 # Install vLLM Ascend
 git clone --depth 1 --branch |vllm_ascend_version| https://github.com/vllm-project/vllm-ascend.git
 cd vllm-ascend
 pip install -e . --extra-index https://download.pytorch.org/whl/cpu/
+cd ..
 ```
 :::
 
@@ -216,6 +222,7 @@ docker run --rm \
 -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
 -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
 -v /etc/ascend_install.info:/etc/ascend_install.info \
+-v /root/.cache:/root/.cache \
 -it $IMAGE bash
 ```
 
````

tests/singlecard/test_offline_inference.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -30,7 +30,9 @@
 
 MODELS = [
     "Qwen/Qwen2.5-0.5B-Instruct",
+    "vllm-ascend/Qwen2.5-0.5B-Instruct-w8a8",
 ]
+os.environ["VLLM_USE_MODELSCOPE"] = "True"
 os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
 
 
```
Lines changed: 151 additions & 0 deletions
@@ -0,0 +1,151 @@

```python
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from typing import Optional, Tuple, Union

import torch
import torch_npu
from vllm.logger import logger
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import UnquantizedLinearMethod


# func refers to RMSNorm.__init__
def wrapper_rmsnorm_init(func):

    def init(self, hidden_size: int, **extra_args) -> None:
        func(self, hidden_size, **extra_args)
        self.ignore_anti = True
        self.bias = torch.nn.Parameter(torch.zeros(hidden_size),
                                       requires_grad=False)

    return init


# func refers to RMSNorm.forward_oot
def wrapper_rmsnorm_forward_oot(func):

    def _rmsnorm_forward_oot(
        self,
        x: torch.Tensor,
        residual: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        if not self.ignore_anti:
            if residual is not None:
                residual += x
                out = torch_npu._npu_quant_rms_norm(
                    residual,
                    self.weight,
                    self.bias,
                    self.input_scale,
                    self.input_offset,
                    self.variance_epsilon,
                )
                return out, residual
            out = torch_npu._npu_quant_rms_norm(
                x,
                self.weight,
                self.bias,
                self.input_scale,
                self.input_offset,
                self.variance_epsilon,
            )
            return out

        if residual is not None:
            x, residual = func(self, x, residual)
            return x.add_(self.bias), residual

        return func(self, x).add_(self.bias)

    return _rmsnorm_forward_oot


MODEL_LAYER_MAPPING = {
    "LlamaModel": {
        "attn": {
            "layer_attr": "self_attn",
            "proj_attr": "qkv_proj",
            "norm_attr": "input_layernorm",
            "unquantized_type": UnquantizedLinearMethod,
        },
        "mlp": {
            "layer_attr": "mlp",
            "proj_attr": "gate_up_proj",
            "norm_attr": "post_attention_layernorm",
            "unquantized_type": UnquantizedLinearMethod,
        },
    },
}


def wrapper_load_model(func):

    def postprocess_loading(self) -> None:
        func(self)

        def process_layer(layer, idx, mapping):

            def process_module(module_cfg, layer_obj):
                if module_cfg is None:
                    return

                module_obj = getattr(layer_obj, module_cfg["layer_attr"], None)
                if module_obj is None:
                    return

                proj_attr = module_cfg["proj_attr"]
                if callable(proj_attr):
                    proj = proj_attr(module_obj, idx)
                else:
                    proj = getattr(module_obj, proj_attr, None)

                norm = getattr(layer_obj, module_cfg["norm_attr"], None)

                if proj is None or norm is None:
                    return

                norm.ignore_anti = isinstance(proj.quant_method,
                                              module_cfg["unquantized_type"])
                if not norm.ignore_anti:
                    for param_name in ["input_scale", "input_offset"]:
                        if hasattr(proj, param_name):
                            param = getattr(proj, param_name)
                            norm.register_parameter(
                                param_name,
                                torch.nn.Parameter(param.clone(),
                                                   requires_grad=False))

            process_module(mapping.get("attn"), layer)
            process_module(mapping.get("mlp"), layer)

        model_type = self.model.model.__class__.__name__
        mapping = MODEL_LAYER_MAPPING.get(model_type)

        if not mapping:
            logger.info(
                f"Warning: Model type '{model_type}' not found in MODEL_LAYER_MAPPING. Skipping layer mapping."
            )
            return

        for idx, layer in enumerate(self.model.model.layers):
            process_layer(layer, idx, mapping)

        if isinstance(self.model.model.norm, RMSNorm):
            self.model.model.norm.ignore_anti = True

    return postprocess_loading
```
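Per the description above, these wrappers are applied by patching `RMSNorm.__init__`, `RMSNorm.forward_oot`, and `NPUModelRunnerBase.load_model`. Below is a minimal sketch of that wiring, assuming the wrapper functions defined above are in scope; the actual patch site inside vllm-ascend is not shown on this page:

```python
# Sketch only (assumes wrapper_rmsnorm_init / wrapper_rmsnorm_forward_oot from
# the file above are importable); the real patch location may differ.
from vllm.model_executor.layers.layernorm import RMSNorm

RMSNorm.__init__ = wrapper_rmsnorm_init(RMSNorm.__init__)
RMSNorm.forward_oot = wrapper_rmsnorm_forward_oot(RMSNorm.forward_oot)
# NPUModelRunnerBase.load_model is wrapped the same way with wrapper_load_model,
# so postprocess_loading runs once after the weights are loaded.
```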

vllm_ascend/quantization/quant_config.py

Lines changed: 10 additions & 10 deletions
```diff
@@ -306,23 +306,23 @@ def apply(
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        use_grouped_topk: bool,
-        top_k: int,
         router_logits: torch.Tensor,
+        top_k: int,
         renormalize: bool,
-        global_num_experts: int,
-        expert_map: torch.Tensor,
+        use_grouped_topk: bool = False,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
-        is_prefill: bool = True,
+        global_num_experts: int = -1,
+        expert_map: Optional[torch.Tensor] = None,
         custom_routing_function: Optional[Callable] = None,
         scoring_func: str = "softmax",
-        e_score_correction_bias: Optional[torch.Tensor] = None
+        e_score_correction_bias: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> torch.Tensor:
-        return self.quant_method.apply(layer, x, use_grouped_topk, top_k,
-                                       router_logits, renormalize, topk_group,
-                                       num_expert_group, global_num_experts,
-                                       expert_map, is_prefill,
+        return self.quant_method.apply(layer, x, router_logits, top_k,
+                                       renormalize, use_grouped_topk,
+                                       topk_group, num_expert_group,
+                                       global_num_experts, expert_map,
                                        custom_routing_function, scoring_func,
                                        e_score_correction_bias)
 
```
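The reordering above matches the wrapper's `apply` signature to the upstream argument order, and the added `**kwargs` lets it absorb arguments it does not handle. Below is a small self-contained toy of that pattern; none of the names are vllm APIs:

```python
# Toy illustration of accepting and ignoring unknown keyword arguments; all
# names here are made up for the example.
class ToyMoEMethod:

    def apply(self, layer, x, router_logits, top_k, renormalize,
              use_grouped_topk=False, **kwargs):
        # Extra arguments introduced by a newer caller (e.g. a hypothetical
        # `is_prefill`) land in kwargs instead of raising TypeError.
        return {"top_k": top_k, "renormalize": renormalize,
                "ignored": sorted(kwargs)}


print(ToyMoEMethod().apply(None, None, None, top_k=2, renormalize=True,
                           is_prefill=True))
# {'top_k': 2, 'renormalize': True, 'ignored': ['is_prefill']}
```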

0 commit comments
