From d4ee86d48b220c4fdfbc4599ec92305eeda7c232 Mon Sep 17 00:00:00 2001
From: David9857 <985700846@qq.com>
Date: Thu, 12 Jun 2025 20:56:18 +0800
Subject: [PATCH 1/3] feat: replace _npu_rotary_embedding with npu_mrope

Signed-off-by: David9857 <985700846@qq.com>
---
 vllm_ascend/ops/rotary_embedding.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/vllm_ascend/ops/rotary_embedding.py b/vllm_ascend/ops/rotary_embedding.py
index 0c2a00afb6..c27b796b1f 100644
--- a/vllm_ascend/ops/rotary_embedding.py
+++ b/vllm_ascend/ops/rotary_embedding.py
@@ -64,13 +64,14 @@ def rope_forward_oot(
     # TODO: Remove the contiguous in the future.
     query = query.contiguous().view(query.shape[0], -1)
     key = key.contiguous().view(key.shape[0], -1)
-    torch_npu._npu_rotary_embedding(
+    query, key = torch_npu.npu_mrope(
         positions,
         query,
         key,
-        self.head_size,
         self.cos_sin_cache,
-        neox_style,
+        self.head_size,
+        mrope_section=[0,0,0],
+        rotary_mode='half' if neox_style else 'interleave'
     )
     return query.view(query_shape), key.view(key_shape)
 

From e989be5e0e41378341eaadd780c15ac57c7ed3ca Mon Sep 17 00:00:00 2001
From: David9857 <985700846@qq.com>
Date: Thu, 12 Jun 2025 21:03:31 +0800
Subject: [PATCH 2/3] ut: add ut for npu_mrope

Signed-off-by: David9857 <985700846@qq.com>
---
 tests/singlecard/ops/test_rotary_embedding.py | 66 +++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/tests/singlecard/ops/test_rotary_embedding.py b/tests/singlecard/ops/test_rotary_embedding.py
index 2d5ec18daf..e0692f91e7 100644
--- a/tests/singlecard/ops/test_rotary_embedding.py
+++ b/tests/singlecard/ops/test_rotary_embedding.py
@@ -9,6 +9,7 @@
 import pytest
 import torch
 import torch.nn as nn
+import torch_npu
 
 import vllm_ascend.platform  # noqa: F401
 
@@ -196,3 +197,68 @@ def test_rotary_embedding_quant_with_leading_dim(
                                ref_key,
                                atol=DEFAULT_ATOL,
                                rtol=DEFAULT_RTOL)
+
+# test rope with npu_mrope interface with leading dimension and merge seqlen and batch_size as num_tokens
+@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE)
+@pytest.mark.parametrize("batch_size", BATCH_SIZES)
+@pytest.mark.parametrize("seq_len", SEQ_LENS)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("device", DEVICES)
+@torch.inference_mode()
+def test_npu_mrope_quant_with_leading_dim(
+    is_neox_style: bool,
+    batch_size: int,
+    seq_len: int,
+    num_heads: int,
+    head_size: int,
+    rotary_dim: Optional[int],
+    dtype: torch.dtype,
+    seed: int,
+    device: str,
+    max_position: int = 8192,
+    base: int = 10000,
+) -> None:
+    if rotary_dim is None:
+        rotary_dim = head_size
+
+    torch.set_default_device(device)
+    if rotary_dim is None:
+        rotary_dim = head_size
+    rope = RotaryEmbedding(head_size, rotary_dim, max_position, base,
+                           is_neox_style, dtype)
+    rope = rope.to(dtype=dtype)
+    num_tokens = batch_size * seq_len
+    positions = torch.randint(0, max_position, (batch_size * seq_len, ))
+    qkv_tensor = torch.randn(num_tokens,
+                             num_heads * head_size * 3,
+                             dtype=dtype)
+    query, key, _ = qkv_tensor.split(
+        [num_heads * head_size, num_heads * head_size, num_heads * head_size],
+        dim=-1,
+    )
+
+    ref_query, ref_key = rope.forward_native(positions, query, key)
+
+    query, key = torch_npu.npu_mrope(
+        positions,
+        query,
+        key,
+        rope.cos_sin_cache,
+        rope.head_size,
+        mrope_section=[0,0,0],
+        rotary_mode='half' if rope.is_neox_style else 'interleave'
+    )
+
+    # Compare the results.
+    torch.testing.assert_close(query.view(ref_query.size()),
+                               ref_query,
+                               atol=DEFAULT_ATOL,
+                               rtol=DEFAULT_RTOL)
+    torch.testing.assert_close(key.view(ref_key.size()),
+                               ref_key,
+                               atol=DEFAULT_ATOL,
+                               rtol=DEFAULT_RTOL)
\ No newline at end of file

From 97a0908e3646e1d1fc9f6eace54f9cef6efcc36b Mon Sep 17 00:00:00 2001
From: Yikun Jiang
Date: Sun, 15 Jun 2025 12:04:20 +0800
Subject: [PATCH 3/3] Address lint

Signed-off-by: Yikun Jiang
---
 tests/singlecard/ops/test_rotary_embedding.py | 18 +++++++++---------
 vllm_ascend/ops/rotary_embedding.py           |  5 ++---
 2 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/tests/singlecard/ops/test_rotary_embedding.py b/tests/singlecard/ops/test_rotary_embedding.py
index e0692f91e7..a3e8d7f657 100644
--- a/tests/singlecard/ops/test_rotary_embedding.py
+++ b/tests/singlecard/ops/test_rotary_embedding.py
@@ -198,6 +198,7 @@ def test_rotary_embedding_quant_with_leading_dim(
                                atol=DEFAULT_ATOL,
                                rtol=DEFAULT_RTOL)
 
+
 # test rope with npu_mrope interface with leading dimension and merge seqlen and batch_size as num_tokens
 @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE)
 @pytest.mark.parametrize("batch_size", BATCH_SIZES)
@@ -244,14 +245,13 @@ def test_npu_mrope_quant_with_leading_dim(
     ref_query, ref_key = rope.forward_native(positions, query, key)
 
     query, key = torch_npu.npu_mrope(
-        positions,
-        query,
-        key,
-        rope.cos_sin_cache,
-        rope.head_size,
-        mrope_section=[0,0,0],
-        rotary_mode='half' if rope.is_neox_style else 'interleave'
-    )
+        positions,
+        query,
+        key,
+        rope.cos_sin_cache,
+        rope.head_size,
+        mrope_section=[0, 0, 0],
+        rotary_mode='half' if rope.is_neox_style else 'interleave')
 
     # Compare the results.
     torch.testing.assert_close(query.view(ref_query.size()),
@@ -261,4 +261,4 @@ def test_npu_mrope_quant_with_leading_dim(
     torch.testing.assert_close(key.view(ref_key.size()),
                                ref_key,
                                atol=DEFAULT_ATOL,
-                               rtol=DEFAULT_RTOL)
\ No newline at end of file
+                               rtol=DEFAULT_RTOL)

diff --git a/vllm_ascend/ops/rotary_embedding.py b/vllm_ascend/ops/rotary_embedding.py
index c27b796b1f..81dde8b8bf 100644
--- a/vllm_ascend/ops/rotary_embedding.py
+++ b/vllm_ascend/ops/rotary_embedding.py
@@ -70,9 +70,8 @@ def rope_forward_oot(
         key,
         self.cos_sin_cache,
         self.head_size,
-        mrope_section=[0,0,0],
-        rotary_mode='half' if neox_style else 'interleave'
-    )
+        mrope_section=[0, 0, 0],
+        rotary_mode='half' if neox_style else 'interleave')
     return query.view(query_shape), key.view(key_shape)
 
 
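
Note on what the new call computes: with mrope_section=[0, 0, 0] the
multimodal section split (the temporal/height/width partition used by
multimodal RoPE) is disabled, so torch_npu.npu_mrope degenerates to plain
rotary position embedding and can stand in for the removed
_npu_rotary_embedding call; per the ternary in patch 1, rotary_mode='half'
corresponds to neox-style rotation and 'interleave' to the non-neox
(interleaved) layout. Below is a minimal standalone sketch of that drop-in
usage. It is a sketch under assumptions, not part of the series: it assumes
an Ascend device with torch and torch_npu installed, the tensor sizes are
illustrative, and the cos/sin cache is built the way vLLM's RotaryEmbedding
builds it for rotary_dim == head_size.

    import torch
    import torch_npu  # Ascend NPU extension; requires an NPU runtime

    # Illustrative sizes (hypothetical, not taken from the patches).
    num_tokens, num_heads, head_size = 4, 8, 128
    max_position, base = 8192, 10000

    # cos/sin cache laid out as [cos | sin] per position, following
    # vLLM's RotaryEmbedding recipe for a full rotary_dim.
    inv_freq = 1.0 / (base**(
        torch.arange(0, head_size, 2, dtype=torch.float32) / head_size))
    t = torch.arange(max_position, dtype=torch.float32)
    freqs = torch.einsum("i,j->ij", t, inv_freq)
    cos_sin_cache = torch.cat((freqs.cos(), freqs.sin()), dim=-1).npu()

    # Flattened [num_tokens, num_heads * head_size] layout, matching the
    # patched rope_forward_oot after its contiguous().view() calls.
    positions = torch.randint(0, max_position, (num_tokens, )).npu()
    query = torch.randn(num_tokens, num_heads * head_size).npu()
    key = torch.randn(num_tokens, num_heads * head_size).npu()

    # mrope_section=[0, 0, 0] -> no multimodal split: plain neox-style RoPE.
    query, key = torch_npu.npu_mrope(positions,
                                     query,
                                     key,
                                     cos_sin_cache,
                                     head_size,
                                     mrope_section=[0, 0, 0],
                                     rotary_mode='half')

Patch 3 only reflows this call for lint; the semantics are determined
entirely by the mrope_section and rotary_mode arguments shown above.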