Skip to content

Commit 610273b

Browse files
committed
enable npu_mrope by environment variables
Signed-off-by: Pr0Wh1teGivee <calvin_zhu0210@outlook.com>
1 parent 38692b5 commit 610273b

File tree

2 files changed

+59
-3
lines changed

2 files changed

+59
-3
lines changed

vllm_ascend/envs.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,12 @@
133133
# value to False to disable the optimized model.
134134
"USE_OPTIMIZED_MODEL":
135135
lambda: bool(int(os.getenv('USE_OPTIMIZED_MODEL', '1'))),
136+
# VLLM_ASCEND_ENABLE_NPU_MROPE:
137+
# 0: using npu_rotary_embedding.
138+
# 1: using npu_mrope.
139+
# This is a temporary switch; it will be removed once npu_mrope supports aclgraph mode.
140+
"VLLM_ASCEND_ENABLE_NPU_MROPE":
141+
lambda: bool(int(os.getenv('VLLM_ASCEND_ENABLE_NPU_MROPE', '0'))),
136142
}
137143

138144
# end-env-vars-definition

vllm_ascend/ops/rotary_embedding.py

Lines changed: 53 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from vllm.model_executor.layers.rotary_embedding import (
2323
DeepseekScalingRotaryEmbedding, RotaryEmbedding)
2424

25+
import vllm_ascend.envs as ascend_envs
2526
from vllm_ascend.platform import CUSTOM_OP_ENABLED
2627

2728

@@ -75,6 +76,52 @@ def rope_forward_oot(
7576
return query.view(query_shape), key.view(key_shape)
7677

7778

79+
def rope_forward_oot_npu_mrope(
    self,
    positions: torch.Tensor,
    query: torch.Tensor,
    key: torch.Tensor,
    offsets: Optional[torch.Tensor] = None,
    is_neox_style_override: Optional[bool] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Apply rotary position embedding to ``query`` and ``key`` on NPU.

    The custom ``torch.ops._C.rotary_embedding`` kernel is used whenever
    ``custom_rotary_embedding_enabled`` accepts the inputs; otherwise the
    call falls back to ``torch_npu.npu_mrope``.

    Args:
        positions: Position tensor forwarded unchanged to the kernel.
        query: Query tensor; the result is viewed back to its input shape.
        key: Key tensor; the result is viewed back to its input shape.
        offsets: Must be ``None`` on the ``npu_mrope`` path.
        is_neox_style_override: When not ``None``, replaces
            ``self.is_neox_style`` for this call.

    Returns:
        The rotated ``(query, key)`` pair, each with its original shape.

    Raises:
        NotImplementedError: If ``offsets`` is given and the custom kernel
            path was not taken.
    """
    import torch_npu

    original_q_shape = query.shape
    original_k_shape = key.shape

    # Bring the cos/sin cache onto the same device and dtype as the query
    # before handing it to either kernel.
    if self.cos_sin_cache.device != query.device:
        self.cos_sin_cache = self.cos_sin_cache.to(query.device)
    if self.cos_sin_cache.dtype != query.dtype:
        self.cos_sin_cache = self.cos_sin_cache.to(query.dtype)

    use_neox = (self.is_neox_style
                if is_neox_style_override is None else is_neox_style_override)

    # Preferred path: the custom rotary_embedding kernel.
    if custom_rotary_embedding_enabled(query, use_neox, self.head_size):
        rotated_q, rotated_k = torch.ops._C.rotary_embedding(
            positions,
            query,
            key,
            self.head_size,
            self.cos_sin_cache,
            use_neox,
        )
        return rotated_q.view(original_q_shape), rotated_k.view(
            original_k_shape)

    if offsets is not None:
        raise NotImplementedError(
            "Batched rotary embedding is currently not supported on NPU.")

    # TODO: Remove the contiguous in the future.
    # Collapse everything after the leading dimension into one axis.
    flat_q = query.contiguous().view(query.shape[0], -1)
    flat_k = key.contiguous().view(key.shape[0], -1)
    mode = "half" if use_neox else "interleave"
    # NOTE(review): mrope_section=[0, 0, 0] presumably selects plain
    # (non-sectioned) rope -- confirm against the npu_mrope documentation.
    rotated_q, rotated_k = torch_npu.npu_mrope(
        positions,
        flat_q,
        flat_k,
        self.cos_sin_cache,
        self.head_size,
        mrope_section=[0, 0, 0],
        rotary_mode=mode)
    return rotated_q.view(original_q_shape), rotated_k.view(original_k_shape)
123+
124+
78125
def native_rope_deepseek_forward(self,
79126
positions: torch.Tensor,
80127
query: torch.Tensor,
@@ -95,8 +142,8 @@ def native_rope_deepseek_forward(self,
95142
2).reshape(b, h_q, d)
96143
b, h_k, d = key.shape
97144
key = key.view(b, h_k, d // 2, 2).transpose(3, 2).reshape(b, h_k, d)
98-
q_pe, k_pe = rope_forward_oot(self, positions, query, key, offsets,
99-
neox_style)
145+
q_pe, k_pe = RotaryEmbedding.forward_oot(self, positions, query, key,
146+
offsets, neox_style)
100147
return q_pe, k_pe
101148

102149

@@ -270,7 +317,10 @@ def deepseek_rope_init_func(
270317
device="npu")
271318

272319

273-
RotaryEmbedding.forward_oot = rope_forward_oot
320+
# Choose the out-of-tree rope forward according to the
# VLLM_ASCEND_ENABLE_NPU_MROPE environment flag.
RotaryEmbedding.forward_oot = (rope_forward_oot_npu_mrope
                               if ascend_envs.VLLM_ASCEND_ENABLE_NPU_MROPE
                               else rope_forward_oot)
274324

275325
# Note: we adopt the native huggingface deepseek rope initialization code from
276326
# https://huggingface.co/deepseek-ai/DeepSeek-V3-0324/blob/main/modeling_deepseek.py for

0 commit comments

Comments
 (0)