
Commit 9f5ae8a

Merge branch 'main' into dev
2 parents 87be51e + 00e0243

File tree

10 files changed: +282 -93 lines changed


docs/source/tutorials/multi_npu_quantization.md

Lines changed: 16 additions & 22 deletions

@@ -1,4 +1,4 @@
-# Multi-NPU (deepseek-v2-lite-w8a8)
+# Multi-NPU (QwQ 32B W8A8)
 
 ## Run docker container:
 :::{note}
@@ -31,60 +31,54 @@ docker run --rm \
 ## Install modelslim and convert model
 :::{note}
-You can choose to convert the model yourself or use the quantized model we uploaded, see https://www.modelscope.cn/models/vllm-ascend/DeepSeek-V2-Lite-w8a8
+You can choose to convert the model yourself or use the quantized model we uploaded, see https://www.modelscope.cn/models/vllm-ascend/QwQ-32B-W8A8
 :::
 
 ```bash
-git clone https://gitee.com/ascend/msit
+# (Optional) This tag is recommended and has been verified
+git clone https://gitee.com/ascend/msit -b modelslim-VLLM-8.1.RC1.b020
 
-# (Optional) This commit has been verified
-git checkout a396750f930e3bd2b8aa13730401dcbb4bc684ca
 cd msit/msmodelslim
 # Install by running this script
 bash install.sh
 pip install accelerate
 
-cd /msit/msmodelslim/example/DeepSeek
+cd example/Qwen
 # Original weight path, replace with your local model path
-MODEL_PATH=/home/weight/DeepSeek-V2-Lite
+MODEL_PATH=/home/models/QwQ-32B
 # Path to save the converted weight, replace with your local path
-SAVE_PATH=/home/weight/DeepSeek-V2-Lite-w8a8
-mkdir -p $SAVE_PATH
+SAVE_PATH=/home/models/QwQ-32B-w8a8
+
 # An NPU device is not required for this conversion; you can also set --device_type cpu to run it
-python3 quant_deepseek.py --model_path $MODEL_PATH --save_directory $SAVE_PATH --device_type npu --act_method 2 --w_bit 8 --a_bit 8 --is_dynamic True
+python3 quant_qwen.py --model_path $MODEL_PATH --save_directory $SAVE_PATH --calib_file ../common/boolq.jsonl --w_bit 8 --a_bit 8 --device_type npu --anti_method m1 --trust_remote_code True
 ```
 
 ## Verify the quantized model
 The converted model files look like:
 ```bash
 .
 |-- config.json
-|-- configuration_deepseek.py
-|-- fusion_result.json
+|-- configuration.json
 |-- generation_config.json
-|-- quant_model_description_w8a8_dynamic.json
-|-- quant_model_weight_w8a8_dynamic-00001-of-00004.safetensors
-|-- quant_model_weight_w8a8_dynamic-00002-of-00004.safetensors
-|-- quant_model_weight_w8a8_dynamic-00003-of-00004.safetensors
-|-- quant_model_weight_w8a8_dynamic-00004-of-00004.safetensors
-|-- quant_model_weight_w8a8_dynamic.safetensors.index.json
-|-- tokenization_deepseek_fast.py
+|-- quant_model_description.json
+|-- quant_model_weight_w8a8.safetensors
+|-- README.md
 |-- tokenizer.json
 `-- tokenizer_config.json
 ```
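As a quick sanity check that the conversion produced the files listed above, something like the following can be run against the SAVE_PATH used earlier. This is a minimal sketch; the path is illustrative and the file names are taken from the new listing in the diff.

```python
# Minimal sanity check for the converted weights (illustrative path; adjust to your SAVE_PATH).
from pathlib import Path

save_path = Path("/home/models/QwQ-32B-w8a8")
expected = [
    "config.json",
    "quant_model_description.json",
    "quant_model_weight_w8a8.safetensors",
    "tokenizer.json",
    "tokenizer_config.json",
]

missing = [name for name in expected if not (save_path / name).exists()]
print("missing files:", missing or "none")
```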
 
 Run the following script to start the vLLM server with the quantized model:
 ```bash
-vllm serve /home/weight/DeepSeek-V2-Lite-w8a8 --tensor-parallel-size 4 --trust-remote-code --served-model-name "dpsk-w8a8" --max-model-len 4096
+vllm serve /home/models/QwQ-32B-w8a8 --tensor-parallel-size 4 --served-model-name "qwq-32b-w8a8" --max-model-len 4096 --quantization ascend
 ```
 
 Once your server is started, you can query the model with input prompts:
 ```bash
 curl http://localhost:8000/v1/completions \
     -H "Content-Type: application/json" \
     -d '{
-        "model": "dpsk-w8a8",
-        "prompt": "what is deepseek?",
+        "model": "qwq-32b-w8a8",
+        "prompt": "what is large language model?",
         "max_tokens": "128",
         "top_p": "0.95",
         "top_k": "40",

pytest.ini

Lines changed: 2 additions & 0 deletions

@@ -61,6 +61,8 @@ addopts = --ignore=vllm-empty/tests/test_utils.py
     --ignore=vllm-empty/tests/detokenizer/test_stop_reason.py
     ; oom on llama-2-7b-hf
     --ignore=vllm-empty/tests/detokenizer/test_stop_strings.py
+    ; no need to run on vllm-ascend
+    --ignore=vllm-empty/tests/test_vllm_port.py
 
 testpaths =
     vllm-empty/tests

vllm_ascend/attention/attention_v1.py

Lines changed: 10 additions & 2 deletions

@@ -30,6 +30,7 @@
 from vllm.v1.worker.gpu_input_batch import InputBatch
 
 from vllm_ascend.ops.attention import vanilla_chunked_prefill
+from vllm_ascend.utils import vllm_version_is
 
 
 class AscendAttentionBackend(AttentionBackend):
@@ -144,8 +145,15 @@ def build(self, num_reqs, num_actual_tokens, max_query_len,
         query_start_loc_cpu = self.runner.query_start_loc_cpu[:num_reqs + 1]
         query_start_loc = query_start_loc_cpu.to(self.runner.device,
                                                  non_blocking=True)
-        block_table = (
-            self.runner.input_batch.block_table.get_device_tensor()[:num_reqs])
+        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
+            block_table = (self.runner.input_batch.block_table.
+                           get_device_tensor()[:num_reqs])
+        else:
+            block_table = self.runner.input_batch.block_table[
+                0].get_device_tensor()
+            block_table[:num_reqs, :self.runner.max_num_blocks_per_req] = (
+                block_table[:num_reqs])
+
         query_lens = self.runner.query_lens
         seq_lens = self.runner.seq_lens_cpu[:num_reqs]
         slot_mapping = self.runner.slot_mapping_cpu[:num_actual_tokens].to(
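The block-table access is now gated on the installed vLLM version (the same pattern appears in mla_v1.py below), since vLLM releases after 0.8.5 expose `input_batch.block_table` as an indexable collection of per-KV-cache-group tables rather than a single table. The helper `vllm_version_is` is imported from `vllm_ascend.utils`; a minimal sketch of such a check, shown as an assumption for illustration (the real helper may differ):

```python
# Illustrative sketch of a version gate (assumption: the actual vllm_ascend.utils helper may differ).
from importlib.metadata import version


def vllm_version_is(target: str) -> bool:
    """Return True when the installed vLLM package version matches `target` exactly."""
    return version("vllm") == target


if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
    print("using the single block_table layout")
else:
    print("using the per-group block_table layout (block_table[0])")
```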

vllm_ascend/attention/mla_v1.py

Lines changed: 8 additions & 3 deletions

@@ -16,6 +16,7 @@
 
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla
+from vllm_ascend.utils import vllm_version_is
 from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
 
 if TYPE_CHECKING:
@@ -238,8 +239,12 @@ def build(self,
         # function. We should avoid GPU -> CPU sync as much as possible because
         # it blocks on all previous kernels.
         device = self.runner.device
-        block_table = (
-            self.runner.input_batch.block_table.get_device_tensor()[:num_reqs])
+        if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
+            block_table = (self.runner.input_batch.block_table.
+                           get_device_tensor()[:num_reqs])
+        else:
+            block_table = (self.runner.input_batch.block_table[0].
+                           get_device_tensor()[:num_reqs])
         slot_mapping = self.runner.slot_mapping_cpu[:num_actual_tokens].to(
             device, non_blocking=True)
         input_positions = self.runner.positions_cpu[:num_actual_tokens].to(
@@ -795,4 +800,4 @@ def forward(
         output[:num_decode_tokens] = self._forward_decode(
             decode_ql_nope, decode_q_pe, decode_k_nope, decode_k_pe,
             kv_cache, attn_metadata)
-        return output_padded
+        return output_padded

vllm_ascend/models/qwen2_5_vl.py

Lines changed: 109 additions & 4 deletions

@@ -36,9 +36,9 @@
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.qwen2_5_vl import (
     Qwen2_5_VisionAttention, Qwen2_5_VisionBlock, Qwen2_5_VisionPatchEmbed,
-    Qwen2_5_VisionTransformer, Qwen2_5_VLDummyInputsBuilder,
-    Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLMultiModalProcessor,
-    Qwen2_5_VLProcessingInfo)
+    Qwen2_5_VisionRotaryEmbedding, Qwen2_5_VisionTransformer,
+    Qwen2_5_VLDummyInputsBuilder, Qwen2_5_VLForConditionalGeneration,
+    Qwen2_5_VLMultiModalProcessor, Qwen2_5_VLProcessingInfo)
 from vllm.model_executor.models.utils import maybe_prefix
 from vllm.multimodal import MULTIMODAL_REGISTRY
 
@@ -152,6 +152,15 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         return x
 
 
+class AscendQwen2_5_VisionRotaryEmbedding(Qwen2_5_VisionRotaryEmbedding):
+
+    def __init__(self, dim: int, theta: float = 10000.0) -> None:
+        super().__init__(dim, theta)
+        inv_freq = 1.0 / (theta
+                          **(torch.arange(0, dim, 2, dtype=torch.float) / dim))
+        self.inv_freq = inv_freq
+
+
 class AscendQwen2_5_VisionTransformer(Qwen2_5_VisionTransformer):
 
     def __init__(
@@ -166,6 +175,9 @@ def __init__(
         norm_layer = partial(RMSNorm, eps=norm_eps)
         self.interleaved = interleaved
         self.enable_pad = False
+        head_dim = self.hidden_size // self.num_heads
+        self.rotary_pos_emb = AscendQwen2_5_VisionRotaryEmbedding(head_dim //
+                                                                  2)
         self.patch_embed = AscendQwen2_5_VisionPatchEmbed(
             patch_size=vision_config.patch_size,
             temporal_patch_size=vision_config.temporal_patch_size,
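The new `AscendQwen2_5_VisionRotaryEmbedding` recomputes the rotary inverse frequencies in float32. For reference, the expression it evaluates is the standard RoPE frequency schedule; a small standalone numeric check with illustrative values:

```python
import torch

dim, theta = 8, 10000.0
# inv_freq[i] = theta ** (-2i / dim), computed in float32 as in the class above
inv_freq = 1.0 / (theta**(torch.arange(0, dim, 2, dtype=torch.float) / dim))
print(inv_freq)  # tensor([1.0000, 0.1000, 0.0100, 0.0010])
```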
@@ -298,6 +310,66 @@ def load_weights(self, weights: Iterable[Tuple[str,
             loaded_params.add(name)
         return loaded_params
 
+    def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor:
+        pos_ids = []
+        for t, h, w in grid_thw:
+            hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
+            wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
+            hpos_ids = hpos_ids.reshape(
+                h // self.spatial_merge_size,
+                self.spatial_merge_size,
+                w // self.spatial_merge_size,
+                self.spatial_merge_size,
+            ).permute(0, 2, 1, 3).flatten()
+            wpos_ids = wpos_ids.reshape(
+                h // self.spatial_merge_size,
+                self.spatial_merge_size,
+                w // self.spatial_merge_size,
+                self.spatial_merge_size,
+            ).permute(0, 2, 1, 3).flatten()
+            pos_ids.append(
+                torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
+        pos_ids = torch.cat(pos_ids, dim=0)
+        max_grid_size = grid_thw[:, 1:].max()
+        rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
+        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
+        return rotary_pos_emb
+
+    def get_window_index(self, grid_thw):
+        window_index: list = []
+        cu_window_seqlens: list = [0]
+        window_index_id = 0
+        vit_merger_window_size = (self.window_size //
+                                  self.spatial_merge_size // self.patch_size)
+
+        for grid_t, grid_h, grid_w in grid_thw:
+            llm_grid_h = grid_h // self.spatial_merge_size
+            llm_grid_w = grid_w // self.spatial_merge_size
+            index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape(
+                grid_t, llm_grid_h, llm_grid_w)
+            pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size
+            pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size
+            num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size
+            num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size
+            index_padded = F.pad(index, (0, pad_w, 0, pad_h), 'constant', -100)
+            index_padded = index_padded.reshape(grid_t, num_windows_h,
+                                                vit_merger_window_size,
+                                                num_windows_w,
+                                                vit_merger_window_size)
+            index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape(
+                grid_t, num_windows_h * num_windows_w, vit_merger_window_size,
+                vit_merger_window_size)
+            seqlens = (index_padded != -100).sum([2, 3]).reshape(-1)
+            index_padded = index_padded.reshape(-1)
+            index_new = index_padded[index_padded != -100]
+            window_index.append(index_new + window_index_id)
+            cu_seqlens_tmp = seqlens.cumsum(
+                0) * self.spatial_merge_unit + cu_window_seqlens[-1]
+            cu_window_seqlens.extend(cu_seqlens_tmp.tolist())
+            window_index_id += (grid_t * llm_grid_h * llm_grid_w).item()
+        window_index = torch.cat(window_index, dim=0)
+        return window_index, cu_window_seqlens
+
     def forward(
         self,
         x: torch.Tensor,
@@ -366,4 +438,37 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             norm_eps=getattr(config, "rms_norm_eps", 1e-6),
             quant_config=self._maybe_ignore_quant_config(quant_config),
             prefix=maybe_prefix(prefix, "visual"),
-        )
+        )
+
+    def _process_image_input(self, image_input) -> tuple[torch.Tensor, ...]:
+
+        grid_thw = image_input["image_grid_thw"]
+        assert grid_thw.ndim == 2
+
+        if image_input["type"] == "image_embeds":
+            image_embeds = image_input["image_embeds"].type(self.visual.dtype)
+        else:
+            pixel_values = image_input["pixel_values"].type(self.visual.dtype)
+            image_embeds = self.visual(pixel_values, grid_thw=grid_thw)
+
+        # Split concatenated embeddings for each image item.
+        merge_size = self.visual.spatial_merge_size
+        sizes = grid_thw.prod(-1) // merge_size // merge_size
+        return image_embeds.split(sizes.tolist())
+
+    def _process_video_input(self, video_input) -> tuple[torch.Tensor, ...]:
+
+        grid_thw = video_input["video_grid_thw"]
+        assert grid_thw.ndim == 2
+
+        if video_input["type"] == "video_embeds":
+            video_embeds = video_input["video_embeds"].type(self.visual.dtype)
+        else:
+            pixel_values_videos = video_input["pixel_values_videos"].type(
+                self.visual.dtype)
+            video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw)
+
+        # Split concatenated embeddings for each video item.
+        merge_size = self.visual.spatial_merge_size
+        sizes = grid_thw.prod(-1) // merge_size // merge_size
+        return video_embeds.split(sizes.tolist())
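The two `_process_*_input` overrides split the concatenated visual embeddings back into per-item tensors, using each item's grid size divided by the spatial merge factor. A standalone sketch of that arithmetic, with illustrative shapes that are not taken from the commit:

```python
import torch

grid_thw = torch.tensor([[1, 32, 32], [1, 16, 16]])  # (t, h, w) per image
merge_size = 2  # spatial_merge_size: each 2x2 patch group becomes one token
hidden = 1280

# tokens contributed by each image after the spatial merge
sizes = grid_thw.prod(-1) // merge_size // merge_size  # tensor([256, 64])

image_embeds = torch.randn(int(sizes.sum()), hidden)  # concatenated embeddings
per_image = image_embeds.split(sizes.tolist())         # one tensor per image
print([tuple(t.shape) for t in per_image])             # [(256, 1280), (64, 1280)]
```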
