From d4ee86d48b220c4fdfbc4599ec92305eeda7c232 Mon Sep 17 00:00:00 2001
From: David9857 <985700846@qq.com>
Date: Thu, 12 Jun 2025 20:56:18 +0800
Subject: [PATCH 1/3] feat: replace _npu_rotary_embedding with npu_mrope

Signed-off-by: David9857 <985700846@qq.com>
---
 vllm_ascend/ops/rotary_embedding.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/vllm_ascend/ops/rotary_embedding.py b/vllm_ascend/ops/rotary_embedding.py
index 0c2a00afb6..c27b796b1f 100644
--- a/vllm_ascend/ops/rotary_embedding.py
+++ b/vllm_ascend/ops/rotary_embedding.py
@@ -64,13 +64,14 @@ def rope_forward_oot(
     # TODO: Remove the contiguous in the future.
     query = query.contiguous().view(query.shape[0], -1)
     key = key.contiguous().view(key.shape[0], -1)
-    torch_npu._npu_rotary_embedding(
+    query, key = torch_npu.npu_mrope(
         positions,
         query,
         key,
-        self.head_size,
         self.cos_sin_cache,
-        neox_style,
+        self.head_size,
+        mrope_section=[0,0,0],
+        rotary_mode='half' if neox_style else 'interleave'
     )
     return query.view(query_shape), key.view(key_shape)
 

From e989be5e0e41378341eaadd780c15ac57c7ed3ca Mon Sep 17 00:00:00 2001
From: David9857 <985700846@qq.com>
Date: Thu, 12 Jun 2025 21:03:31 +0800
Subject: [PATCH 2/3] ut: add ut for npu_mrope

Signed-off-by: David9857 <985700846@qq.com>
---
 tests/singlecard/ops/test_rotary_embedding.py | 66 +++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/tests/singlecard/ops/test_rotary_embedding.py b/tests/singlecard/ops/test_rotary_embedding.py
index 2d5ec18daf..e0692f91e7 100644
--- a/tests/singlecard/ops/test_rotary_embedding.py
+++ b/tests/singlecard/ops/test_rotary_embedding.py
@@ -9,6 +9,7 @@
 import pytest
 import torch
 import torch.nn as nn
+import torch_npu
 
 import vllm_ascend.platform  # noqa: F401
 
@@ -196,3 +197,68 @@ def test_rotary_embedding_quant_with_leading_dim(
                                ref_key,
                                atol=DEFAULT_ATOL,
                                rtol=DEFAULT_RTOL)
+
+# test rope with npu_mrope interface with leading dimension and merge seqlen and batch_size as num_tokens
+@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE)
+@pytest.mark.parametrize("batch_size", BATCH_SIZES)
+@pytest.mark.parametrize("seq_len", SEQ_LENS)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("device", DEVICES)
+@torch.inference_mode()
+def test_npu_mrope_quant_with_leading_dim(
+    is_neox_style: bool,
+    batch_size: int,
+    seq_len: int,
+    num_heads: int,
+    head_size: int,
+    rotary_dim: Optional[int],
+    dtype: torch.dtype,
+    seed: int,
+    device: str,
+    max_position: int = 8192,
+    base: int = 10000,
+) -> None:
+    if rotary_dim is None:
+        rotary_dim = head_size
+
+    torch.set_default_device(device)
+    if rotary_dim is None:
+        rotary_dim = head_size
+    rope = RotaryEmbedding(head_size, rotary_dim, max_position, base,
+                           is_neox_style, dtype)
+    rope = rope.to(dtype=dtype)
+    num_tokens = batch_size * seq_len
+    positions = torch.randint(0, max_position, (batch_size * seq_len, ))
+    qkv_tensor = torch.randn(num_tokens,
+                             num_heads * head_size * 3,
+                             dtype=dtype)
+    query, key, _ = qkv_tensor.split(
+        [num_heads * head_size, num_heads * head_size, num_heads * head_size],
+        dim=-1,
+    )
+
+    ref_query, ref_key = rope.forward_native(positions, query, key)
+
+    query, key = torch_npu.npu_mrope(
+        positions,
+        query,
+        key,
+        rope.cos_sin_cache,
+        rope.head_size,
+        mrope_section=[0,0,0],
+        rotary_mode='half' if rope.is_neox_style else 'interleave'
+    )
+
+    # Compare the results.
+    torch.testing.assert_close(query.view(ref_query.size()),
+                               ref_query,
+                               atol=DEFAULT_ATOL,
+                               rtol=DEFAULT_RTOL)
+    torch.testing.assert_close(key.view(ref_key.size()),
+                               ref_key,
+                               atol=DEFAULT_ATOL,
+                               rtol=DEFAULT_RTOL)
\ No newline at end of file

From 97a0908e3646e1d1fc9f6eace54f9cef6efcc36b Mon Sep 17 00:00:00 2001
From: Yikun Jiang
Date: Sun, 15 Jun 2025 12:04:20 +0800
Subject: [PATCH 3/3] Address lint

Signed-off-by: Yikun Jiang
---
 tests/singlecard/ops/test_rotary_embedding.py | 18 +++++++++---------
 vllm_ascend/ops/rotary_embedding.py           |  5 ++---
 2 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/tests/singlecard/ops/test_rotary_embedding.py b/tests/singlecard/ops/test_rotary_embedding.py
index e0692f91e7..a3e8d7f657 100644
--- a/tests/singlecard/ops/test_rotary_embedding.py
+++ b/tests/singlecard/ops/test_rotary_embedding.py
@@ -198,6 +198,7 @@ def test_rotary_embedding_quant_with_leading_dim(
                                atol=DEFAULT_ATOL,
                                rtol=DEFAULT_RTOL)
 
+
 # test rope with npu_mrope interface with leading dimension and merge seqlen and batch_size as num_tokens
 @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE)
 @pytest.mark.parametrize("batch_size", BATCH_SIZES)
@@ -244,14 +245,13 @@ def test_npu_mrope_quant_with_leading_dim(
     ref_query, ref_key = rope.forward_native(positions, query, key)
 
     query, key = torch_npu.npu_mrope(
-        positions,
-        query,
-        key,
-        rope.cos_sin_cache,
-        rope.head_size,
-        mrope_section=[0,0,0],
-        rotary_mode='half' if rope.is_neox_style else 'interleave'
-    )
+        positions,
+        query,
+        key,
+        rope.cos_sin_cache,
+        rope.head_size,
+        mrope_section=[0, 0, 0],
+        rotary_mode='half' if rope.is_neox_style else 'interleave')
 
     # Compare the results.
     torch.testing.assert_close(query.view(ref_query.size()),
@@ -261,4 +261,4 @@ def test_npu_mrope_quant_with_leading_dim(
     torch.testing.assert_close(key.view(ref_key.size()),
                                ref_key,
                                atol=DEFAULT_ATOL,
-                               rtol=DEFAULT_RTOL)
\ No newline at end of file
+                               rtol=DEFAULT_RTOL)

diff --git a/vllm_ascend/ops/rotary_embedding.py b/vllm_ascend/ops/rotary_embedding.py
index c27b796b1f..81dde8b8bf 100644
--- a/vllm_ascend/ops/rotary_embedding.py
+++ b/vllm_ascend/ops/rotary_embedding.py
@@ -70,9 +70,8 @@ def rope_forward_oot(
         key,
         self.cos_sin_cache,
         self.head_size,
-        mrope_section=[0,0,0],
-        rotary_mode='half' if neox_style else 'interleave'
-    )
+        mrope_section=[0, 0, 0],
+        rotary_mode='half' if neox_style else 'interleave')
     return query.view(query_shape), key.view(key_shape)
 
 
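
Note on what the new call computes: with mrope_section=[0, 0, 0] the
multimodal section split (the temporal/height/width partition used by
multimodal RoPE) is disabled, so torch_npu.npu_mrope degenerates to plain
rotary position embedding and can stand in for the removed
_npu_rotary_embedding call; per the ternary in patch 1, rotary_mode='half'
corresponds to neox-style rotation and 'interleave' to the non-neox
(interleaved) layout. Below is a minimal standalone sketch of that drop-in
usage. It is a sketch under assumptions, not part of the series: it assumes
an Ascend device with torch and torch_npu installed, the tensor sizes are
illustrative, and the cos/sin cache is built the way vLLM's RotaryEmbedding
builds it for rotary_dim == head_size.

    import torch
    import torch_npu  # Ascend NPU extension; requires an NPU runtime

    # Illustrative sizes (hypothetical, not taken from the patches).
    num_tokens, num_heads, head_size = 4, 8, 128
    max_position, base = 8192, 10000

    # cos/sin cache laid out as [cos | sin] per position, following
    # vLLM's RotaryEmbedding recipe for a full rotary_dim.
    inv_freq = 1.0 / (base**(
        torch.arange(0, head_size, 2, dtype=torch.float32) / head_size))
    t = torch.arange(max_position, dtype=torch.float32)
    freqs = torch.einsum("i,j->ij", t, inv_freq)
    cos_sin_cache = torch.cat((freqs.cos(), freqs.sin()), dim=-1).npu()

    # Flattened [num_tokens, num_heads * head_size] layout, matching the
    # patched rope_forward_oot after its contiguous().view() calls.
    positions = torch.randint(0, max_position, (num_tokens, )).npu()
    query = torch.randn(num_tokens, num_heads * head_size).npu()
    key = torch.randn(num_tokens, num_heads * head_size).npu()

    # mrope_section=[0, 0, 0] -> no multimodal split: plain neox-style RoPE.
    query, key = torch_npu.npu_mrope(positions,
                                     query,
                                     key,
                                     cos_sin_cache,
                                     head_size,
                                     mrope_section=[0, 0, 0],
                                     rotary_mode='half')

Patch 3 only reflows this call for lint; the semantics are determined
entirely by the mrope_section and rotary_mode arguments shown above.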