Skip to content

Commit 9dae7d4

Browse files
authored
[Refactor] Remove Unused Env VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON (#20334)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
1 parent 7058d7d commit 9dae7d4

File tree

2 files changed

+0
-8
lines changed

2 files changed

+0
-8
lines changed

vllm/envs.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,6 @@
104104
VLLM_SERVER_DEV_MODE: bool = False
105105
VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128
106106
VLLM_MLA_DISABLE: bool = False
107-
VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = False
108107
VLLM_RAY_PER_WORKER_GPUS: float = 1.0
109108
VLLM_RAY_BUNDLE_INDICES: str = ""
110109
VLLM_CUDART_SO_PATH: Optional[str] = None
@@ -769,12 +768,6 @@ def get_vllm_port() -> Optional[int]:
769768
"VLLM_MLA_DISABLE":
770769
lambda: bool(int(os.getenv("VLLM_MLA_DISABLE", "0"))),
771770

772-
# If set, vLLM will use the Triton implementation of moe_align_block_size,
773-
# i.e. moe_align_block_size_triton in fused_moe.py.
774-
"VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON":
775-
lambda: bool(int(os.getenv("VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON", "0"))
776-
),
777-
778771
# Number of GPUs per worker in Ray, if it is set to be a fraction,
779772
# it allows ray to schedule multiple actors on a single GPU,
780773
# so that users can colocate other actors on the same GPUs as vLLM.

vllm/model_executor/layers/fused_moe/moe_align_block_size.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,6 @@ def moe_align_block_size_stage4(
9494

9595
# Triton implementation based on:
9696
# https://github.com/sgl-project/sglang/commit/ba5112ff691d791a9e38c6c71f59324a5fcb49d0
97-
# TODO(wentao): Deprecate this function in the future.
9897
def moe_align_block_size_triton(
9998
topk_ids: torch.Tensor,
10099
num_experts: int,

0 commit comments

Comments (0)