1 file changed: +6 -3 lines changed
Original file line number | Diff line number | Diff line change
18
18
from typing import Optional
19
19
20
20
import torch
21
+ import vllm .model_executor .models .qwen3_moe as qwen3
22
+
21
23
from torch import nn
22
24
from vllm .attention import AttentionMetadata
23
25
from vllm .distributed import (get_tensor_model_parallel_world_size ,
@@ -91,13 +93,12 @@ def forward(
91
93
attn_metadata = get_forward_context ().attn_metadata
92
94
# During a profile run, force the experts to receive load-balanced tokens
93
95
# to avoid high memory consumption on a single rank.
94
- is_prefill = True
95
96
if attn_metadata is None :
96
97
# for profile run
97
98
is_prefill = True
98
99
enable_force_load_balance = True
99
100
else :
100
- # is_prefill = attn_metadata.num_prefills > 0 is_prefill or
101
+ is_prefill = False
101
102
enable_force_load_balance = False
102
103
if hasattr (attn_metadata , 'with_prefill_across_dp' ):
103
104
is_prefill = attn_metadata .with_prefill_across_dp
@@ -114,4 +115,6 @@ def forward(
114
115
shared_experts = None ,
115
116
)
116
117
117
- return hidden_states
118
+ return hidden_states
119
+
120
+ qwen3 .Qwen3MoeSparseMoeBlock = AscendSparseMoeBlock
You can’t perform that action at this time.
0 commit comments