2 files changed: 0 additions, 8 deletions
model_executor/layers/fused_moe Expand file tree Collapse file tree 2 files changed +0
-8
lines changed Original file line number Diff line number Diff line change 104
104
VLLM_SERVER_DEV_MODE : bool = False
105
105
VLLM_V1_OUTPUT_PROC_CHUNK_SIZE : int = 128
106
106
VLLM_MLA_DISABLE : bool = False
107
- VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON : bool = False
108
107
VLLM_RAY_PER_WORKER_GPUS : float = 1.0
109
108
VLLM_RAY_BUNDLE_INDICES : str = ""
110
109
VLLM_CUDART_SO_PATH : Optional [str ] = None
@@ -769,12 +768,6 @@ def get_vllm_port() -> Optional[int]:
769
768
"VLLM_MLA_DISABLE" :
770
769
lambda : bool (int (os .getenv ("VLLM_MLA_DISABLE" , "0" ))),
771
770
772
- # If set, vLLM will use the Triton implementation of moe_align_block_size,
773
- # i.e. moe_align_block_size_triton in fused_moe.py.
774
- "VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON" :
775
- lambda : bool (int (os .getenv ("VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON" , "0" ))
776
- ),
777
-
778
771
# Number of GPUs per worker in Ray, if it is set to be a fraction,
779
772
# it allows ray to schedule multiple actors on a single GPU,
780
773
# so that users can colocate other actors on the same GPUs as vLLM.
Original file line number Diff line number Diff line change @@ -94,7 +94,6 @@ def moe_align_block_size_stage4(
94
94
95
95
# Triton implementation based on:
96
96
# https://github.com/sgl-project/sglang/commit/ba5112ff691d791a9e38c6c71f59324a5fcb49d0
97
- # TODO(wentao): Deprecated this function in the future.
98
97
def moe_align_block_size_triton (
99
98
topk_ids : torch .Tensor ,
100
99
num_experts : int ,
You can’t perform that action at this time.
0 commit comments