diff --git a/neural_compressor/common/utils/__init__.py b/neural_compressor/common/utils/__init__.py
index 0ded3dcc90d..47923dd3c8c 100644
--- a/neural_compressor/common/utils/__init__.py
+++ b/neural_compressor/common/utils/__init__.py
@@ -21,27 +21,6 @@
 from neural_compressor.common.utils.utility import *
 
-# FIXME: (Yi) REMOVE BELOW CODE
-import os
-
-DEEPSEEK_EXPERTS = 256
-VLLM_TP_SIZE = int(os.getenv("VLLM_TP_SIZE", "8"))
-VLLM_EP_SIZE = int(os.getenv("VLLM_EP_SIZE", VLLM_TP_SIZE))
-NUM_EXPERTS_PER_EP_RANK = DEEPSEEK_EXPERTS // VLLM_EP_SIZE  # 32
-VLLM_MOE_N_SLICE = int(os.getenv("VLLM_MOE_N_SLICE", 8))
-NUM_EXPERTS_PER_GROUP_PER_RANK = NUM_EXPERTS_PER_EP_RANK // VLLM_MOE_N_SLICE  # 4
-FUSED_MOE_EXPERTS = NUM_EXPERTS_PER_GROUP_PER_RANK  # 4
-
-logger.warning_once(
-    (
-        f"INC uses VLLM_TP_SIZE={VLLM_TP_SIZE},\n"
-        f"VLLM_EP_SIZE={VLLM_EP_SIZE},\n"
-        f"NUM_EXPERTS_PER_EP_RANK={NUM_EXPERTS_PER_EP_RANK},\n"
-        f"VLLM_MOE_N_SLICE={VLLM_MOE_N_SLICE},\n"
-        f"NUM_EXPERTS_PER_GROUP_PER_RANK={NUM_EXPERTS_PER_GROUP_PER_RANK},\n"
-        f"FUSED_MOE_EXPERTS={FUSED_MOE_EXPERTS}"
-    )
-)
 
 import sys
 import pdb
diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/common.py b/neural_compressor/torch/algorithms/fp8_quant/_core/common.py
index 99a31ffe4e4..6691c272254 100644
--- a/neural_compressor/torch/algorithms/fp8_quant/_core/common.py
+++ b/neural_compressor/torch/algorithms/fp8_quant/_core/common.py
@@ -32,7 +32,6 @@
     get_patched_module_table,
     get_patched_module_type_table,
 )
-from neural_compressor.common import utils as inc_utils
 from neural_compressor.torch.utils.auto_accelerator import auto_detect_accelerator, INCAcceleratorType
 
 deepspeed_exists = False
 if importlib.util.find_spec("deepspeed"):  # check if deepspeed is installed
@@ -59,7 +58,7 @@ def maybe_dequant_original_fp8_weight(mod: torch.nn.Module, param: torch.Tensor)
     "dynamic_moe": ModuleType(
         1,
         [],
-        inc_utils.FUSED_MOE_EXPERTS + 1,  # FIXME (Yi) # one output, FUSED_MOE_EXPERTS weights
+        8 + 1,
         True,
     ),
 }
@@ -235,12 +234,12 @@ def convert_scales_to_tensors_dict(scales_obj, scales_file_format, hp_dtype, dev
     "VllmMixtureOfExpertsOpFP8": ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpFP8),
     # FIXME (Yi) revert change
     "FusedMoE": ModuleInfo("linear", PatchedMixtralMoE, False),
-    # "GaudiMixtralSparseMoeBlock": ModuleInfo("dynamic_moe", PatchedGaudiMixtralSparseMoeBlock),
-    # "VllmMixtureOfExpertsOp": (
-    #     ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpV2)
-    #     if os.getenv("LOW_CPU_MEM", "0") == "1"
-    #     else ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpV1)
-    # ),
+    "GaudiMixtralSparseMoeBlock": ModuleInfo("dynamic_moe", PatchedGaudiMixtralSparseMoeBlock),
+    "VllmMixtureOfExpertsOp": (
+        ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpV2)
+        if os.getenv("LOW_CPU_MEM", "0") == "1"
+        else ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpV1)
+    ),
 }
diff --git a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py
index b80b6e8a3e5..cbc3ce944a2 100644
--- a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py
+++ b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py
@@ -781,6 +781,7 @@ def forward_quant(self,
                       hidden_states,
                       expert_routing_table,
                       router_weights,
+                      layer=None,
                       permuted_weights=True,
                       activation="silu"):
         experts_range = range(self.num_experts)
@@ -810,6 +811,7 @@ def forward_measure(self,
                         hidden_states,
                         expert_routing_table,
                         router_weights,
+                        layer=None,
                         permuted_weights=True,
                         activation="silu"):
         experts_range = range(self.num_experts)
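
Note: the restored "VllmMixtureOfExpertsOp" registration above picks one of two patched MoE implementations at import time, keyed on the LOW_CPU_MEM environment variable. Below is a minimal, self-contained sketch of that dispatch pattern only; ModuleInfo and the patched classes are stubbed placeholders here, not the real fp8_quant types.

import os
from dataclasses import dataclass


@dataclass
class ModuleInfo:
    # Stub standing in for fp8_quant's ModuleInfo: a module-type name
    # plus the patched class used to replace the original module.
    type: str
    patched_module: type


class PatchedVllmMixtureOfExpertsOpV1:  # placeholder for the default patched op
    pass


class PatchedVllmMixtureOfExpertsOpV2:  # placeholder for the low-CPU-memory variant
    pass


# Same shape as the restored registry entry: V2 is selected only when
# LOW_CPU_MEM=1 is set in the environment, otherwise V1 is used.
MOE_MODULE_INFO = (
    ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpV2)
    if os.getenv("LOW_CPU_MEM", "0") == "1"
    else ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpV1)
)

print(MOE_MODULE_INFO.patched_module.__name__)

Because the ternary runs at module import, the choice is fixed for the process lifetime; flipping LOW_CPU_MEM requires a restart, not just a re-call.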