diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/common.py b/neural_compressor/torch/algorithms/fp8_quant/_core/common.py
index 99a31ffe4e4..fa34087dc22 100644
--- a/neural_compressor/torch/algorithms/fp8_quant/_core/common.py
+++ b/neural_compressor/torch/algorithms/fp8_quant/_core/common.py
@@ -235,12 +235,12 @@ def convert_scales_to_tensors_dict(scales_obj, scales_file_format, hp_dtype, dev
     "VllmMixtureOfExpertsOpFP8": ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpFP8),
     # FIXME (Yi) revert change
     "FusedMoE": ModuleInfo("linear", PatchedMixtralMoE, False),
-    # "GaudiMixtralSparseMoeBlock": ModuleInfo("dynamic_moe", PatchedGaudiMixtralSparseMoeBlock),
-    # "VllmMixtureOfExpertsOp": (
-    #     ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpV2)
-    #     if os.getenv("LOW_CPU_MEM", "0") == "1"
-    #     else ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpV1)
-    # ),
+    "GaudiMixtralSparseMoeBlock": ModuleInfo("dynamic_moe", PatchedGaudiMixtralSparseMoeBlock),
+    "VllmMixtureOfExpertsOp": (
+        ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpV2)
+        if os.getenv("LOW_CPU_MEM", "0") == "1"
+        else ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpV1)
+    ),
 }
diff --git a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py
index b80b6e8a3e5..cbc3ce944a2 100644
--- a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py
+++ b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py
@@ -781,6 +781,7 @@ def forward_quant(self,
                       hidden_states,
                       expert_routing_table,
                       router_weights,
+                      layer=None,
                       permuted_weights=True,
                       activation="silu"):
         experts_range = range(self.num_experts)
@@ -810,6 +811,7 @@ def forward_measure(self,
                         hidden_states,
                         expert_routing_table,
                         router_weights,
+                        layer=None,
                         permuted_weights=True,
                         activation="silu"):
         experts_range = range(self.num_experts)
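
Reviewer note: the re-enabled "VllmMixtureOfExpertsOp" entry selects its patched
implementation once, at import time, based on the LOW_CPU_MEM environment
variable. The sketch below is a minimal, self-contained illustration of that
selection pattern; ModuleInfo and the two Patched* classes here are simplified
stand-ins for the real definitions in _core/common.py and helper_modules.py,
not the actual API.

    import os
    from collections import namedtuple

    # Simplified stand-in for neural_compressor's ModuleInfo.
    ModuleInfo = namedtuple("ModuleInfo", ["op_type", "patched_module"])

    class PatchedVllmMixtureOfExpertsOpV1:  # stand-in: default patch
        pass

    class PatchedVllmMixtureOfExpertsOpV2:  # stand-in: low-host-memory variant
        pass

    # Resolved when the module is imported: LOW_CPU_MEM=1 picks V2,
    # anything else (including unset) falls back to V1.
    mod_info = (
        ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpV2)
        if os.getenv("LOW_CPU_MEM", "0") == "1"
        else ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpV1)
    )
    print(mod_info.patched_module.__name__)  # ...V2 only when LOW_CPU_MEM=1

Because the branch is evaluated at import time, LOW_CPU_MEM must be set before
the process imports this module; changing it afterwards has no effect. The new
layer=None keyword in forward_quant and forward_measure lets the patched
signatures accept a caller-supplied layer argument while staying backward
compatible with callers that do not pass one.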