2 files changed: +5 -2 lines changed
vllm/model_executor/layers/fused_moe (2 files changed: +5 -2 lines changed)

@@ -716,6 +716,9 @@ def apply(
         intermediate_cache2 = _resize_cache(workspace2,
                                             (E, max_num_tokens, N // 2))
 
+        if self.use_fp8_w8a8:
+            intermediate_cache1.fill_(0)
+
         # MM1
         invoke_moe_batched_triton_kernel(A=hidden_states,
                                          B=w1,
@@ -426,10 +426,10 @@ def forward(
 
         # We can reuse the memory between cache1 and cache3 because by the
         # time we need cache3, we're done with cache1.
-        workspace13 = torch.zeros(prod(workspace13_shape),
+        workspace13 = torch.empty(prod(workspace13_shape),
                                   device=a1.device,
                                   dtype=workspace_dtype)
-        workspace2 = torch.zeros(prod(workspace2_shape),
+        workspace2 = torch.empty(prod(workspace2_shape),
                                  device=a1.device,
                                  dtype=workspace_dtype)
 
435
435
You can’t perform that action at this time.
0 commit comments