From 730bf0cb3c12dc43df89169f96644e4ca4735288 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Thu, 20 Mar 2025 10:28:54 +0200 Subject: [PATCH 01/28] fix patched mod wip Change-Id: Ibb92d77e312394736cb548f60f7039fd9944adce Signed-off-by: Yi Liu --- .../algorithms/fp8_quant/_core/common.py | 14 ++-- .../algorithms/fp8_quant/_core/measure.py | 2 + .../fp8_quant/_quant_common/helper_modules.py | 67 ++++++++++++------- 3 files changed, 52 insertions(+), 31 deletions(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/common.py b/neural_compressor/torch/algorithms/fp8_quant/_core/common.py index 11992ec8d63..775fb690061 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/common.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/common.py @@ -224,13 +224,13 @@ def convert_scales_to_tensors_dict(scales_obj, scales_file_format, hp_dtype, dev "MoeMatmul": ModuleInfo("linear", PatchedMoeMatmul), "ReplicatedLinear": ModuleInfo("linear", PatchedReplicatedLinear), # FIXME (Yi) revert change - "FusedMoE": ModuleInfo("linear", PatchedMixtralMoE, False), - "GaudiMixtralSparseMoeBlock": ModuleInfo("dynamic_moe", PatchedGaudiMixtralSparseMoeBlock), - "VllmMixtureOfExpertsOp": ( - ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpV2) - if os.getenv("LOW_CPU_MEM", "0") == "1" - else ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpV1) - ), + # "FusedMoE": ModuleInfo("linear", PatchedMixtralMoE, False), + # "GaudiMixtralSparseMoeBlock": ModuleInfo("dynamic_moe", PatchedGaudiMixtralSparseMoeBlock), + # "VllmMixtureOfExpertsOp": ( + # ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpV2) + # if os.getenv("LOW_CPU_MEM", "0") == "1" + # else ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpV1) + # ), } diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/measure.py b/neural_compressor/torch/algorithms/fp8_quant/_core/measure.py index 9b1db9d0906..ee620046946 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/measure.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/measure.py @@ -167,6 +167,8 @@ def register_patched_measure_modules(model, mod_list, observer_class, d_shapes=N # logger.info(f"Pacthed module pmod: {pmod}") if pmod._mod_extra_config: for param_name in pmod._mod_extra_config.params: + # if torch.distributed.get_rank() == 0: + # import pdb; pdb.set_trace() param = getattr(pmod, param_name) if config["measure_on_hpu"]: param = param.to(cur_accelerator.name()) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py index 245b3f08a16..b7b0bf4125c 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py @@ -416,13 +416,22 @@ def forward_quant(self, input): return self.post_all_reduce(dqoutput) def forward_measure(self, input): - resolved_input = self.resolve_input(input) - measure_input((resolved_input,), observer=self._mod_extra_config.inputs) - output = torch.matmul(resolved_input, self.weight.transpose(-1, -2)) + # if torch.distributed.get_rank() == 0: + # import pdb; pdb.set_trace() + # torch.distributed.barrier() + # resolved_input = self.resolve_input(input) + # measure_input((resolved_input,), observer=self._mod_extra_config.inputs) + # output = torch.matmul(resolved_input, self.weight.transpose(-1, -2)) + # measure_output((output,), self._mod_extra_config.outputs) + # if 
self.reduce_results: + # output = self.collective_func(output) + # return self.post_all_reduce(output) + + # FIXME: It is not fully correct here + measure_input((input,), observer=self._mod_extra_config.inputs) + output, output_bias = self.orig_mod(input) measure_output((output,), self._mod_extra_config.outputs) - if self.reduce_results: - output = self.collective_func(output) - return self.post_all_reduce(output) + return output, output_bias def post_all_reduce(self, output): assert ( @@ -473,12 +482,21 @@ def forward_quant(self, input): return self.post_all_reduce(dqoutput) def forward_measure(self, input): + # if torch.distributed.get_rank() == 0: + # import pdb; pdb.set_trace() measure_input((input,), observer=self._mod_extra_config.inputs) - output = torch.matmul(input, self.weight.transpose(-1, -2)) + # FIXME: it is not fully correct here, + output, output_bias = self.orig_mod(input) measure_output((output,), self._mod_extra_config.outputs) - if self.gather_output: - output = self.collective_func(output) - return self.post_all_reduce(output) + return output, output_bias + + # output = torch.matmul(input, self.weight.transpose(-1, -2)) + + # output = torch.matmul(input, self.weight.transpose(-1, -2)) + # measure_output((output,), self._mod_extra_config.outputs) + # if self.gather_output: + # output = self.collective_func(output) + # return self.post_all_reduce(output) def post_all_reduce(self, output): if not self.skip_bias_add: @@ -576,18 +594,19 @@ def __init__(self, mod, parent, mod_extra_config, *args, **kwargs): super().__init__(mod, parent, mod_extra_config, *args, **kwargs) # remove the MoE weights that are quanted by PatchedMoeMatmul if self.quantization_mode in [QuantMode.QUANTIZE, QuantMode.LOAD]: - if hasattr(mod, "w13_weight"): - delattr(mod, "w13_weight") - setattr(mod, "w13_weight", None) - if hasattr(mod, "w2_weight"): - delattr(mod, "w2_weight") - setattr(self, "w2_weight", None) - if hasattr(mod, "w1_weight"): - delattr(mod, "w1_weight") - setattr(self, "w1_weight", None) - if hasattr(mod, "w3_weight"): - delattr(mod, "w3_weight") - setattr(self, "w3_weight", None) + pass + # if hasattr(mod, "w13_weight"): + # delattr(mod, "w13_weight") + # setattr(mod, "w13_weight", None) + # if hasattr(mod, "w2_weight"): + # delattr(mod, "w2_weight") + # setattr(self, "w2_weight", None) + # if hasattr(mod, "w1_weight"): + # delattr(mod, "w1_weight") + # setattr(self, "w1_weight", None) + # if hasattr(mod, "w3_weight"): + # delattr(mod, "w3_weight") + # setattr(self, "w3_weight", None) self.forward = self.forward_orig @@ -724,8 +743,8 @@ def extra_repr(self) -> str: class PatchedVllmMixtureOfExpertsOpV1(PatchedModuleBase): def __init__(self, mod, parent, mod_extra_config, *args, **kwargs): super().__init__(mod, parent, mod_extra_config, *args, **kwargs) - self.experts_min = self.orig_mod.experts_min - self.experts_max = self.orig_mod.experts_max + self.experts_min = self.orig_mod.experts_min if hasattr(self.orig_mod, "experts_min") else 0 + self.experts_max = self.orig_mod.experts_max if hasattr(self.orig_mod, "experts_max") else 7 if self.quantization_mode in [QuantMode.QUANTIZE, QuantMode.LOAD]: self.forward = self.forward_quant self.dynamic_moe_op = get_quantized_func_wrapper(OP_TYPE.DYNAMIC_MOE_FUSED_WEIGHTS, self.scale_format) From fbd59c2d9079f2791c11936c8dfeec6b510779b1 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Thu, 20 Mar 2025 12:40:32 +0200 Subject: [PATCH 02/28] dequant fp8 if needed Change-Id: If65d0253fa8bbc926b5a33b75c2946fc51e49272 Signed-off-by: Yi Liu --- 
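
This change is meant to let measurement and re-quantization work on checkpoints
whose weights are already stored in FP8 (for example block-quantized DeepSeek
weights). Before a parameter is observed or quantized, the new
maybe_dequant_original_fp8_weight() helper asks the owning module for a
dequantization hook and, when one is provided, operates on the dequantized BF16
view instead of the raw FP8 tensor. Below is a minimal sketch of the calling
convention the helper assumes; MyFP8Linear, its weight_scale_inv buffer and the
_dequant closure are hypothetical illustrations, not code introduced by this
patch.

import torch

class MyFP8Linear(torch.nn.Module):
    """Hypothetical module that keeps its weight in FP8 plus an inverse scale."""

    def __init__(self, weight_fp8: torch.Tensor, weight_scale_inv: torch.Tensor):
        super().__init__()
        # FP8 parameters must not require grad.
        self.weight = torch.nn.Parameter(weight_fp8, requires_grad=False)
        self.register_buffer("weight_scale_inv", weight_scale_inv)

    def get_post_process_weights_func(self):
        # Hook consulted by maybe_dequant_original_fp8_weight(): return a callable
        # that rebuilds the high-precision weight from the stored FP8 data.
        def _dequant(mod):
            return mod.weight.to(torch.bfloat16) * mod.weight_scale_inv.to(torch.bfloat16)
        return _dequant

With a hook like this in place, the observer in measure.py and the quantizer in
quantize.py receive the BF16 tensor produced by _dequant rather than the FP8
parameter itself.
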
.../torch/algorithms/fp8_quant/_core/common.py | 8 ++++++++ .../torch/algorithms/fp8_quant/_core/measure.py | 3 ++- .../torch/algorithms/fp8_quant/_core/quantize.py | 3 ++- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/common.py b/neural_compressor/torch/algorithms/fp8_quant/_core/common.py index 775fb690061..49a067b9305 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/common.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/common.py @@ -42,6 +42,14 @@ INFO_INTERVAL = 30 # seconds +def maybe_dequant_original_fp8_weight(mod: torch.nn.Module, param: torch.Tensor): + if param.dtype in [torch.float8_e4m3fn]: + if hasattr(mod, "get_post_process_weights_func"): + post_process_weights_func = mod.get_post_process_weights_func() + if post_process_weights_func is not None: + param = post_process_weights_func(mod) + return param + _mod_types = { "linear": ModuleType(1, ["weight"], 1, False), "matmul": ModuleType(2, [], 1, False), diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/measure.py b/neural_compressor/torch/algorithms/fp8_quant/_core/measure.py index ee620046946..36ea0616f94 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/measure.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/measure.py @@ -22,7 +22,7 @@ import time from .._quant_common.quant_config import MeasureExclude, QuantMode, ScaleMethod, get_hqt_config, set_hqt_config # from ..utils.logger import logger -from neural_compressor.torch.algorithms.fp8_quant._core.common import INFO_INTERVAL +from neural_compressor.torch.algorithms.fp8_quant._core.common import INFO_INTERVAL, maybe_dequant_original_fp8_weight from .common import * from neural_compressor.torch.utils.auto_accelerator import auto_detect_accelerator from neural_compressor.torch.algorithms.fp8_quant.model_configs import ( @@ -170,6 +170,7 @@ def register_patched_measure_modules(model, mod_list, observer_class, d_shapes=N # if torch.distributed.get_rank() == 0: # import pdb; pdb.set_trace() param = getattr(pmod, param_name) + param = maybe_dequant_original_fp8_weight(pmod.orig_mod, param) if config["measure_on_hpu"]: param = param.to(cur_accelerator.name()) pmod._mod_extra_config.params[param_name].measure(param) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py index 83ae9f7ea27..31e41647955 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py @@ -33,7 +33,7 @@ import time cur_accelerator = auto_detect_accelerator() -from neural_compressor.torch.algorithms.fp8_quant._core.common import INFO_INTERVAL +from neural_compressor.torch.algorithms.fp8_quant._core.common import INFO_INTERVAL, maybe_dequant_original_fp8_weight @torch.no_grad() @@ -78,6 +78,7 @@ def quantize_params(mod, mod_extra_config): param = getattr(mod, param_name) if param.dtype == torch.float16: param = param.to(torch.bfloat16) + param = maybe_dequant_original_fp8_weight(mod, param) quantized_param = quantizer(param.to(cur_accelerator.name())) delattr(mod, param_name) setattr(mod, param_name, nn.Parameter(quantized_param)) From 8328996a322556586d92c553ea07971435c07bff Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 09:22:06 +0200 Subject: [PATCH 03/28] add patched mod for fp8 ds Change-Id: I3e6893be05ba36eb67e367a775fde5ee93d3fa7e Signed-off-by: Yi Liu --- 
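
This patch registers the FP8-DeepSeek module types (MoeFP8Matmul and the
runtime-dequant dynamic-MoE op) so the fused-MoE path can be measured with
on-the-fly dequantized expert weights and then re-quantized with INC scales.
The two-pass flow these patched modules plug into is sketched below; it assumes
the standard prepare/finalize_calibration/convert entry points of
neural_compressor.torch.quantization, and the JSON config file names are
placeholders only.

import torch
from neural_compressor.torch.quantization import (
    FP8Config,
    convert,
    finalize_calibration,
    prepare,
)

def measure_pass(model, calib_dataloader, measure_json="maxabs_measure.json"):
    # Pass 1 (typically its own run): patched modules observe amax per tensor
    # and, for the dynamic-MoE op, per expert.
    model = prepare(model, FP8Config.from_json_file(measure_json))
    with torch.no_grad():
        for batch in calib_dataloader:
            model(batch)
    finalize_calibration(model)  # dump the measurement files to disk
    return model

def quantize_pass(model, quant_json="maxabs_quant.json"):
    # Pass 2 (typically a fresh run on the original model): derive scales from
    # the measurement dump and swap in the FP8 kernels.
    return convert(model, FP8Config.from_json_file(quant_json))

During pass 1, MoeFP8Matmul exposes get_dequant_weight() so the fused-MoE
measurement kernel runs on BF16 expert weights; during pass 2 those weights are
re-quantized with the scales computed by INC.
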
neural_compressor/common/utils/__init__.py | 2 +- .../algorithms/fp8_quant/_core/common.py | 2 + .../algorithms/fp8_quant/_core/quantize.py | 9 +- .../fp8_quant/_quant_common/helper_modules.py | 116 +++++++++++++++++- 4 files changed, 121 insertions(+), 8 deletions(-) diff --git a/neural_compressor/common/utils/__init__.py b/neural_compressor/common/utils/__init__.py index 5b4a8043ff1..93133e30571 100644 --- a/neural_compressor/common/utils/__init__.py +++ b/neural_compressor/common/utils/__init__.py @@ -28,7 +28,7 @@ VLLM_TP_SIZE = int(os.getenv("VLLM_TP_SIZE", "8")) VLLM_EP_SIZE = int(os.getenv("VLLM_EP_SIZE", VLLM_TP_SIZE)) NUM_EXPERTS_PER_EP_RANK = DEEPSEEK_EXPERTS // VLLM_EP_SIZE # 32 -NUM_EXPERTS_GROUPS = 8 +NUM_EXPERTS_GROUPS = int(os.getenv("NUM_EXPERTS_GROUPS", 8)) NUM_EXPERTS_PER_GROUP_PER_RANK = NUM_EXPERTS_PER_EP_RANK // NUM_EXPERTS_GROUPS # 4 FUSED_MOE_EXPERTS = NUM_EXPERTS_PER_GROUP_PER_RANK # 4 diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/common.py b/neural_compressor/torch/algorithms/fp8_quant/_core/common.py index 49a067b9305..d127e5c6a3f 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/common.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/common.py @@ -230,7 +230,9 @@ def convert_scales_to_tensors_dict(scales_obj, scales_file_format, hp_dtype, dev "Softmax": ModuleInfo("softmax", PatchedSoftmax), "ModuleFusedSDPA": ModuleInfo("fused_sdpa", PatchedModuleFusedSDPA), "MoeMatmul": ModuleInfo("linear", PatchedMoeMatmul), + "MoeFP8Matmul": ModuleInfo("linear", PatchedMoeFP8Matmul), "ReplicatedLinear": ModuleInfo("linear", PatchedReplicatedLinear), + "DynamicMoeRuntimeDequantFP8": ModuleInfo("dynamic_moe", PatchedDynamicMoeRuntimeDequantFP8), # FIXME (Yi) revert change # "FusedMoE": ModuleInfo("linear", PatchedMixtralMoE, False), # "GaudiMixtralSparseMoeBlock": ModuleInfo("dynamic_moe", PatchedGaudiMixtralSparseMoeBlock), diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py index 31e41647955..f6b4ff87cf8 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py @@ -180,14 +180,17 @@ def prepare_model(model, mod_list, measurement, scale_file, scaling_method_name, logger.debug("Patched modules: %s", patched_modules) logger.debug("Total patched modules: %d", len(patched_modules)) model = model.to(cur_accelerator.name()) - for _, mod in model.named_modules(): - if hasattr(mod, "post_process"): - mod.post_process() + postporcess_after_convert_(model) torch.distributed.barrier() convert_fp16_to_bf16(model) cur_accelerator.synchronize() +def postporcess_after_convert_(model): + for _, mod in model.named_modules(): + if hasattr(mod, "post_process"): + mod.post_process() + def prepare_model_with_dummy_measurement(model, mod_list, scaling_method_name, scale_config): """Aim for loading, replace module with patched module for model on meta device. 
diff --git a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py index b7b0bf4125c..31eea4b9994 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py @@ -651,7 +651,16 @@ def extra_repr(self) -> str: get_current_repr(self, "scale_input", "scale_weight"), ) - +class PatchedMoeFP8Matmul(PatchedMoeMatmul): + def __init__(self, mod, parent, mod_extra_config, *args, **kwargs): + super().__init__(mod, parent, mod_extra_config, *args, **kwargs) + # if torch.distributed.get_rank() == 0: + # import pdb; pdb.set_trace() + # torch.distributed.barrier() + # self.block_size = self.orig_mod.block_size + # self.scale_inv_fp8 = self.orig_mod.scale_inv_fp8 + self.get_dequant_weight = self.orig_mod.get_dequant_weight + class PatchedGaudiMixtralSparseMoeBlock(PatchedModuleBase): def __init__(self, mod, parent, mod_extra_config, *args, **kwargs): super().__init__(mod, parent, mod_extra_config, *args, **kwargs) @@ -756,11 +765,18 @@ def __init__(self, mod, parent, mod_extra_config, *args, **kwargs): [mod_extra_config.scale.inputs[x] for x in range(1, self.num_experts+1)], self.scale_format, ) - for i in range(self.num_experts): - self.w13_list[i].weight = self.w13_list[i].weight.squeeze().t().contiguous() - self.w2_list[i].weight = self.w2_list[i].weight.squeeze().t().contiguous() + # if torch.distributed.get_rank() == 0: + # import pdb; pdb.set_trace() + # torch.distributed.barrier() + self._post_init_for_quant() + elif (self.quantization_mode == QuantMode.MEASURE) or (self.quantization_mode == QuantMode.SHAPE): self.forward = self.forward_measure + + def _post_init_for_quant(self): + for i in range(self.num_experts): + self.w13_list[i].weight = self.w13_list[i].weight.squeeze().t().contiguous() + self.w2_list[i].weight = self.w2_list[i].weight.squeeze().t().contiguous() def forward_quant(self, hidden_states, @@ -832,6 +848,98 @@ def extra_repr(self) -> str: f"quant_mode:{quant_mode}, {get_current_repr(self, *member_names)}", ) +class PatchedDynamicMoeRuntimeDequantFP8(PatchedVllmMixtureOfExpertsOpV1): + def _post_init_for_quant(self): + pass + + def post_process(self): + # if torch.distributed.get_rank() == 0: + # import pdb; pdb.set_trace() + # torch.distributed.barrier() + for i in range(self.num_experts): + self.w13_list[i].weight = torch.nn.Parameter(self.w13_list[i].weight.squeeze().t().contiguous()) + self.w2_list[i].weight = torch.nn.Parameter(self.w2_list[i].weight.squeeze().t().contiguous()) + + def forward_measure( + self, + x, + topk_ids, + topk_weights, + moe_n_slice, + n_expert_slice, + ep_shift, + ): + hidden_states = x + measure_input((hidden_states,), observer=self._mod_extra_config.inputs) + # Assume moe_n_slice is 1 + assert moe_n_slice == 1, f"moe_n_slice is {moe_n_slice}, expected 1" + i = 0 + # for i in range(moe_n_slice): + min_expert = i * n_expert_slice + max_expert = (i + 1) * n_expert_slice + w13_list_slice = [] + w2_list_slice = [] + for j in range(min_expert, max_expert): + w13_list_slice.append(self.w13_list[j].get_dequant_weight()) + w2_list_slice.append(self.w2_list[j].get_dequant_weight()) + + output, intermidiate_amax = torch.ops.hpu.mixture_of_experts.fp8_measurement_fused_weights( + hidden_states=x, + expert_routing_table=topk_ids.to(torch.int64), + router_weights=topk_weights.to(x.dtype), + w12=w13_list_slice, + w3=w2_list_slice, + permuted_weights=True, 
+ activation="silu", + experts_min=min_expert + ep_shift, + experts_max=max_expert - 1 + ep_shift, + measurement_mode=True, # <============= + ) + output_measure_list = [output] + # if torch.distributed.get_rank() == 0: + # import pdb; pdb.set_trace() + # torch.distributed.barrier() + for i in range(self.num_experts): + output_measure_list.append(intermidiate_amax[i]) + measure_output(output_measure_list, self._mod_extra_config.outputs) + return output + + def forward_quant( + self, + x, + topk_ids, + topk_weights, + moe_n_slice, + n_expert_slice, + ep_shift=None, + ): + hidden_states = x + expert_routing_table = topk_ids.to(torch.int64) + router_weights = topk_weights.to(x.dtype) + permuted_weights = True + activation = "silu" + experts_range = range(self.num_experts) + w1_list = [self.w13_list[i].weight for i in experts_range] + w2_list = [self.w2_list[i].weight for i in experts_range] + scale_w1 = [self.w13_list[i].scale_weight for i in experts_range] + scale_w2 = [self.w2_list[i].scale_weight for i in experts_range] + qinput = self.quant_input(hidden_states) + output = self.dynamic_moe_op( + hidden_states=qinput, + expert_routing_table=expert_routing_table, + router_weights=router_weights, + w12=w1_list, + w3=w2_list, + d_scale_w12=scale_w1, + d_scale_w3=scale_w2, + d_scale_hidden_states=self.scale_input, + d_scale_intermediate_hidden_states=self.scale_intermediate, + permuted_weights=permuted_weights, + activation=activation, + experts_min=self.experts_min, + experts_max=self.experts_max, + ) + return output class PatchedVllmMixtureOfExpertsOpV2(PatchedVllmMixtureOfExpertsOpV1): def __init__(self, mod, parent, mod_extra_config, *args, **kwargs): From eb9e4ed089f70757b38c0d46fb03970b1a243cda Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 12:22:24 +0200 Subject: [PATCH 04/28] add fused moe back Change-Id: I5d3b6fa90b2d19fc928791bb9d17b188851f230e Signed-off-by: Yi Liu --- .../algorithms/fp8_quant/_core/common.py | 2 +- .../algorithms/fp8_quant/_core/quantize.py | 1 + .../fp8_quant/_quant_common/helper_modules.py | 25 +++++++++---------- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/common.py b/neural_compressor/torch/algorithms/fp8_quant/_core/common.py index d127e5c6a3f..fda18e9c2d6 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/common.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/common.py @@ -234,7 +234,7 @@ def convert_scales_to_tensors_dict(scales_obj, scales_file_format, hp_dtype, dev "ReplicatedLinear": ModuleInfo("linear", PatchedReplicatedLinear), "DynamicMoeRuntimeDequantFP8": ModuleInfo("dynamic_moe", PatchedDynamicMoeRuntimeDequantFP8), # FIXME (Yi) revert change - # "FusedMoE": ModuleInfo("linear", PatchedMixtralMoE, False), + "FusedMoE": ModuleInfo("linear", PatchedMixtralMoE, False), # "GaudiMixtralSparseMoeBlock": ModuleInfo("dynamic_moe", PatchedGaudiMixtralSparseMoeBlock), # "VllmMixtureOfExpertsOp": ( # ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpV2) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py index f6b4ff87cf8..912ec5f504a 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py @@ -78,6 +78,7 @@ def quantize_params(mod, mod_extra_config): param = getattr(mod, param_name) if param.dtype == torch.float16: param = param.to(torch.bfloat16) + 
logger.debug(f"Quantizing parameter {param_name} of module {mod.__class__.__name__}") param = maybe_dequant_original_fp8_weight(mod, param) quantized_param = quantizer(param.to(cur_accelerator.name())) delattr(mod, param_name) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py index 31eea4b9994..07dc0b68f5e 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py @@ -594,19 +594,18 @@ def __init__(self, mod, parent, mod_extra_config, *args, **kwargs): super().__init__(mod, parent, mod_extra_config, *args, **kwargs) # remove the MoE weights that are quanted by PatchedMoeMatmul if self.quantization_mode in [QuantMode.QUANTIZE, QuantMode.LOAD]: - pass - # if hasattr(mod, "w13_weight"): - # delattr(mod, "w13_weight") - # setattr(mod, "w13_weight", None) - # if hasattr(mod, "w2_weight"): - # delattr(mod, "w2_weight") - # setattr(self, "w2_weight", None) - # if hasattr(mod, "w1_weight"): - # delattr(mod, "w1_weight") - # setattr(self, "w1_weight", None) - # if hasattr(mod, "w3_weight"): - # delattr(mod, "w3_weight") - # setattr(self, "w3_weight", None) + if hasattr(mod, "w13_weight"): + delattr(mod, "w13_weight") + setattr(mod, "w13_weight", None) + if hasattr(mod, "w2_weight"): + delattr(mod, "w2_weight") + setattr(self, "w2_weight", None) + if hasattr(mod, "w1_weight"): + delattr(mod, "w1_weight") + setattr(self, "w1_weight", None) + if hasattr(mod, "w3_weight"): + delattr(mod, "w3_weight") + setattr(self, "w3_weight", None) self.forward = self.forward_orig From 204bd992e84f5d8bc4fdbcc078bdebe9fc6e5ff5 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 13:28:43 +0200 Subject: [PATCH 05/28] show mem after patch Change-Id: Idf7dbfecd6a87dd8c48fad8fc7d31a689ddb4fbd Signed-off-by: Yi Liu --- neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py index 912ec5f504a..e47f3501213 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py @@ -174,6 +174,9 @@ def prepare_model(model, mod_list, measurement, scale_file, scaling_method_name, patched_module_types.add(type(mod)) htcore.mark_step() logger.debug("Patched module name: %s", name) + cur_accelerator.synchronize() + logger.info("Patched module name: %s", name) + show_mem_info() if save_file: # cache calculated scales save_scales(model, scales_obj, scales_file_format, scale_file + ".npz") save_scales(model, scales_obj, scales_file_format, scale_file + ".json") From 18f51b22414e05f54b98ffb01b2eccb8aa4788f3 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 15:39:40 +0200 Subject: [PATCH 06/28] debug Change-Id: Ic6954975f8bb1a1507acb5e86807d8fdd21f39dc Signed-off-by: Yi Liu --- .../algorithms/fp8_quant/_core/quantize.py | 23 +++++++++++++++++++ .../fp8_quant/_quant_common/helper_modules.py | 3 +++ neural_compressor/torch/utils/environ.py | 11 +++++---- 3 files changed, 33 insertions(+), 4 deletions(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py index e47f3501213..9fc60f26102 100644 --- 
a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py @@ -183,11 +183,34 @@ def prepare_model(model, mod_list, measurement, scale_file, scaling_method_name, logger.debug("Patched module types: %s", patched_module_types) logger.debug("Patched modules: %s", patched_modules) logger.debug("Total patched modules: %d", len(patched_modules)) + + + def inspect_tensor(tensor, msg=""): + + if "cpu" in str(tensor.device): + logger.info(f"{msg}: tensor dtype: {tensor.dtype}, tensor shape: {tensor.shape}, deveice: {tensor.device}") + # if torch.distributed.get_rank() == 0: + # import pdb; pdb.set_trace() + # torch.distributed.barrier() + + for pname, param in model.named_parameters(): + inspect_tensor(param, pname) + # check buffer + for bname, buffer in model.named_buffers(): + inspect_tensor(buffer, bname) + show_mem_info("before move all") model = model.to(cur_accelerator.name()) + show_mem_info("after move all") postporcess_after_convert_(model) + show_mem_info("after post process") torch.distributed.barrier() convert_fp16_to_bf16(model) + show_mem_info("after convert_fp16_to_bf16") cur_accelerator.synchronize() + show_mem_info("after synchronize") + if torch.distributed.get_rank() == 0: + import pdb; pdb.set_trace() + torch.distributed.barrier() def postporcess_after_convert_(model): diff --git a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py index 07dc0b68f5e..a0453253f95 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py @@ -917,6 +917,9 @@ def forward_quant( router_weights = topk_weights.to(x.dtype) permuted_weights = True activation = "silu" + if torch.distributed.get_rank() == 0: + import pdb; pdb.set_trace() + torch.distributed.barrier() experts_range = range(self.num_experts) w1_list = [self.w13_list[i].weight for i in experts_range] w2_list = [self.w2_list[i].weight for i in experts_range] diff --git a/neural_compressor/torch/utils/environ.py b/neural_compressor/torch/utils/environ.py index e60dcc7ad88..623d3446232 100644 --- a/neural_compressor/torch/utils/environ.py +++ b/neural_compressor/torch/utils/environ.py @@ -235,15 +235,18 @@ def is_tbb_available(): # pragma: no cover return False return True -def show_mem_info(loglevel="info"): +def show_mem_info(msg="", loglevel="info"): hpu_mem_mb = get_used_hpu_mem_MB() from neural_compressor.common.utils import logger show_fn = getattr(logger, loglevel) rank = torch.distributed.get_rank() if torch.distributed.is_initialized() else -1 - show_fn(f"[Rank {rank}] Used HPU memory: {hpu_mem_mb // 1000} GB {hpu_mem_mb % 1000} MB") + # show_fn(f"[Rank {rank}] Used HPU memory: {hpu_mem_mb // 1000} GB {hpu_mem_mb % 1000} MB") cpu_mem_mb = get_used_cpu_mem_MB() - show_fn(f"[Rank {rank}] Used CPU memory: {cpu_mem_mb // 1000} GB {cpu_mem_mb % 1000} MB") - + # show_fn(f"[Rank {rank}] Used CPU memory: {cpu_mem_mb // 1000} GB {cpu_mem_mb % 1000} MB") + show_fn( + f"[Rank {rank}] {msg}, HPU: {hpu_mem_mb // 1000} GB {hpu_mem_mb % 1000:.2f} MB; CPU: {cpu_mem_mb // 1000} GB {cpu_mem_mb % 1000:.2f} MB" + ) + def get_used_hpu_mem_MB(): """Get HPU used memory: MiB.""" From 472f0e27aa4c5fa7ff18a585c05f7edeca541a20 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 15:40:25 +0200 Subject: [PATCH 07/28] update Change-Id: 
Id225051fdede1b5e1def62083a7c5f93af5301cb Signed-off-by: Yi Liu --- neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py index 9fc60f26102..c2d5b1b0222 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py @@ -176,7 +176,7 @@ def prepare_model(model, mod_list, measurement, scale_file, scaling_method_name, logger.debug("Patched module name: %s", name) cur_accelerator.synchronize() logger.info("Patched module name: %s", name) - show_mem_info() + # show_mem_info() if save_file: # cache calculated scales save_scales(model, scales_obj, scales_file_format, scale_file + ".npz") save_scales(model, scales_obj, scales_file_format, scale_file + ".json") From e25d65826976ed1c32fde575baecf6314fdce006 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 15:46:39 +0200 Subject: [PATCH 08/28] sync for each tran Change-Id: I8cdf01349ce58ecc608511d970cedf6b33ab3045 Signed-off-by: Yi Liu --- .../algorithms/fp8_quant/_quant_common/helper_modules.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py index a0453253f95..a16df85d5d4 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py @@ -858,6 +858,7 @@ def post_process(self): for i in range(self.num_experts): self.w13_list[i].weight = torch.nn.Parameter(self.w13_list[i].weight.squeeze().t().contiguous()) self.w2_list[i].weight = torch.nn.Parameter(self.w2_list[i].weight.squeeze().t().contiguous()) + htcore.mark_step() def forward_measure( self, @@ -917,9 +918,9 @@ def forward_quant( router_weights = topk_weights.to(x.dtype) permuted_weights = True activation = "silu" - if torch.distributed.get_rank() == 0: - import pdb; pdb.set_trace() - torch.distributed.barrier() + # if torch.distributed.get_rank() == 0: + # import pdb; pdb.set_trace() + # torch.distributed.barrier() experts_range = range(self.num_experts) w1_list = [self.w13_list[i].weight for i in experts_range] w2_list = [self.w2_list[i].weight for i in experts_range] From 215d31c794b2254abb13f0ca1c6d9bf29b9eabde Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 15:54:23 +0200 Subject: [PATCH 09/28] fix OoM Change-Id: I9c9a72115a72a9cdc762d084227cf114588a1ef7 Signed-off-by: Yi Liu --- .../torch/algorithms/fp8_quant/_core/quantize.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py index c2d5b1b0222..967e3fe8ac9 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py @@ -208,15 +208,13 @@ def inspect_tensor(tensor, msg=""): show_mem_info("after convert_fp16_to_bf16") cur_accelerator.synchronize() show_mem_info("after synchronize") - if torch.distributed.get_rank() == 0: - import pdb; pdb.set_trace() - torch.distributed.barrier() - def postporcess_after_convert_(model): for _, mod in model.named_modules(): if hasattr(mod, "post_process"): mod.post_process() + # Note: It is 
very important to synchronize after each post_process to avoid OoM. + cur_accelerator.synchronize() def prepare_model_with_dummy_measurement(model, mod_list, scaling_method_name, scale_config): """Aim for loading, replace module with patched module for model on meta device. From e8d44f4494a9b5a257c6f453496fd32a903d2215 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 15:55:07 +0200 Subject: [PATCH 10/28] refine log Change-Id: I664f0935e3afa076c495acabdd2cc8e63ab38965 Signed-off-by: Yi Liu --- .../torch/algorithms/fp8_quant/_core/quantize.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py index 967e3fe8ac9..e725a1bd064 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py @@ -167,7 +167,7 @@ def prepare_model(model, mod_list, measurement, scale_file, scaling_method_name, scale_config, save_file) if not config.cfg["fake_quant"] and mod_default_dict[mod_type_str].should_measure_and_quant: quantize_params(mod, mod_extra_config) - logger.debug(f"patching module {name}") + # logger.debug(f"patching module {name}") patch_module(mod, mod_extra_config, mod_default_dict) name = origin_name patched_modules.append(name) @@ -175,7 +175,7 @@ def prepare_model(model, mod_list, measurement, scale_file, scaling_method_name, htcore.mark_step() logger.debug("Patched module name: %s", name) cur_accelerator.synchronize() - logger.info("Patched module name: %s", name) + # logger.info("Patched module name: %s", name) # show_mem_info() if save_file: # cache calculated scales save_scales(model, scales_obj, scales_file_format, scale_file + ".npz") From 13df72f692df09e3106c2e875af171d66bccdd89 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 16:07:41 +0200 Subject: [PATCH 11/28] brr after convert Change-Id: I95fdb8028e58cd4ba9b642b9af716d39b4cb7283 Signed-off-by: Yi Liu --- neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py index e725a1bd064..8de3aeb51c2 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py @@ -203,11 +203,11 @@ def inspect_tensor(tensor, msg=""): show_mem_info("after move all") postporcess_after_convert_(model) show_mem_info("after post process") - torch.distributed.barrier() convert_fp16_to_bf16(model) show_mem_info("after convert_fp16_to_bf16") cur_accelerator.synchronize() show_mem_info("after synchronize") + torch.distributed.barrier() def postporcess_after_convert_(model): for _, mod in model.named_modules(): From 325f69e8809514790bc210fdeab0b300cd09bca8 Mon Sep 17 00:00:00 2001 From: Yi Date: Mon, 24 Mar 2025 04:47:50 +0200 Subject: [PATCH 12/28] remove NUM_EXPERTS_GROUPS --- neural_compressor/common/utils/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/neural_compressor/common/utils/__init__.py b/neural_compressor/common/utils/__init__.py index 93133e30571..0ded3dcc90d 100644 --- a/neural_compressor/common/utils/__init__.py +++ b/neural_compressor/common/utils/__init__.py @@ -28,8 +28,8 @@ VLLM_TP_SIZE = int(os.getenv("VLLM_TP_SIZE", "8")) VLLM_EP_SIZE = int(os.getenv("VLLM_EP_SIZE", VLLM_TP_SIZE)) 
NUM_EXPERTS_PER_EP_RANK = DEEPSEEK_EXPERTS // VLLM_EP_SIZE # 32 -NUM_EXPERTS_GROUPS = int(os.getenv("NUM_EXPERTS_GROUPS", 8)) -NUM_EXPERTS_PER_GROUP_PER_RANK = NUM_EXPERTS_PER_EP_RANK // NUM_EXPERTS_GROUPS # 4 +VLLM_MOE_N_SLICE = int(os.getenv("VLLM_MOE_N_SLICE", 8)) +NUM_EXPERTS_PER_GROUP_PER_RANK = NUM_EXPERTS_PER_EP_RANK // VLLM_MOE_N_SLICE # 4 FUSED_MOE_EXPERTS = NUM_EXPERTS_PER_GROUP_PER_RANK # 4 logger.warning_once( @@ -37,7 +37,7 @@ f"INC uses VLLM_TP_SIZE={VLLM_TP_SIZE},\n" f"VLLM_EP_SIZE={VLLM_EP_SIZE},\n" f"NUM_EXPERTS_PER_EP_RANK={NUM_EXPERTS_PER_EP_RANK},\n" - f"NUM_EXPERTS_GROUPS={NUM_EXPERTS_GROUPS},\n" + f"VLLM_MOE_N_SLICE={VLLM_MOE_N_SLICE},\n" f"NUM_EXPERTS_PER_GROUP_PER_RANK={NUM_EXPERTS_PER_GROUP_PER_RANK},\n" f"FUSED_MOE_EXPERTS={FUSED_MOE_EXPERTS}" ) From ea8c842ab9855e67ffbef69e2403aec18cc3ed5f Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Mon, 24 Mar 2025 13:34:17 +0800 Subject: [PATCH 13/28] Refine R1 WOQ Requant (#2150) * rename moe op Signed-off-by: Yi * refine moe Signed-off-by: Yi --------- Signed-off-by: Yi --- .../torch/algorithms/fp8_quant/_core/common.py | 2 +- .../fp8_quant/_quant_common/helper_modules.py | 16 +++++++--------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/common.py b/neural_compressor/torch/algorithms/fp8_quant/_core/common.py index fda18e9c2d6..20ea48ddd55 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/common.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/common.py @@ -232,7 +232,7 @@ def convert_scales_to_tensors_dict(scales_obj, scales_file_format, hp_dtype, dev "MoeMatmul": ModuleInfo("linear", PatchedMoeMatmul), "MoeFP8Matmul": ModuleInfo("linear", PatchedMoeFP8Matmul), "ReplicatedLinear": ModuleInfo("linear", PatchedReplicatedLinear), - "DynamicMoeRuntimeDequantFP8": ModuleInfo("dynamic_moe", PatchedDynamicMoeRuntimeDequantFP8), + "VllmMixtureOfExpertsOpFP8": ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpFP8), # FIXME (Yi) revert change "FusedMoE": ModuleInfo("linear", PatchedMixtralMoE, False), # "GaudiMixtralSparseMoeBlock": ModuleInfo("dynamic_moe", PatchedGaudiMixtralSparseMoeBlock), diff --git a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py index a16df85d5d4..a2cd3ac258f 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py @@ -847,7 +847,7 @@ def extra_repr(self) -> str: f"quant_mode:{quant_mode}, {get_current_repr(self, *member_names)}", ) -class PatchedDynamicMoeRuntimeDequantFP8(PatchedVllmMixtureOfExpertsOpV1): +class PatchedVllmMixtureOfExpertsOpFP8(PatchedVllmMixtureOfExpertsOpV1): def _post_init_for_quant(self): pass @@ -871,15 +871,13 @@ def forward_measure( ): hidden_states = x measure_input((hidden_states,), observer=self._mod_extra_config.inputs) - # Assume moe_n_slice is 1 + # FIXME: (Yi) Assume moe_n_slice is 1, remove it? 
assert moe_n_slice == 1, f"moe_n_slice is {moe_n_slice}, expected 1" - i = 0 - # for i in range(moe_n_slice): - min_expert = i * n_expert_slice - max_expert = (i + 1) * n_expert_slice + min_expert = self.experts_min + max_expert = self.experts_max w13_list_slice = [] w2_list_slice = [] - for j in range(min_expert, max_expert): + for j in range(self.num_experts): w13_list_slice.append(self.w13_list[j].get_dequant_weight()) w2_list_slice.append(self.w2_list[j].get_dequant_weight()) @@ -891,8 +889,8 @@ def forward_measure( w3=w2_list_slice, permuted_weights=True, activation="silu", - experts_min=min_expert + ep_shift, - experts_max=max_expert - 1 + ep_shift, + experts_min=min_expert, + experts_max=max_expert, measurement_mode=True, # <============= ) output_measure_list = [output] From 653ca69158e1e6f6c078786fe4d53dc9bf78cfbb Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Mon, 24 Mar 2025 11:16:07 +0200 Subject: [PATCH 14/28] rename func names Change-Id: I1ebeb32c9c8cae1d6ceb400729c6dced5d56a9a5 Signed-off-by: Yi Liu --- .../torch/algorithms/fp8_quant/_core/common.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/common.py b/neural_compressor/torch/algorithms/fp8_quant/_core/common.py index 20ea48ddd55..99a31ffe4e4 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/common.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/common.py @@ -44,10 +44,10 @@ def maybe_dequant_original_fp8_weight(mod: torch.nn.Module, param: torch.Tensor): if param.dtype in [torch.float8_e4m3fn]: - if hasattr(mod, "get_post_process_weights_func"): - post_process_weights_func = mod.get_post_process_weights_func() - if post_process_weights_func is not None: - param = post_process_weights_func(mod) + if hasattr(mod, "get_dequant_weights_func"): + dequant_weights_func = mod.get_dequant_weights_func() + if dequant_weights_func is not None: + param = dequant_weights_func(mod) return param _mod_types = { From be597d5b5696fe8e8be9156ec6bf22905bfc9c97 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 25 Mar 2025 16:05:13 +0200 Subject: [PATCH 15/28] remove debug info Change-Id: I57dc09d7c7db1178ae2a55ec74c6de9c8d488e24 Signed-off-by: Yi Liu --- .../torch/algorithms/fp8_quant/_core/quantize.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py index 8de3aeb51c2..c16f59a74f2 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py @@ -175,8 +175,6 @@ def prepare_model(model, mod_list, measurement, scale_file, scaling_method_name, htcore.mark_step() logger.debug("Patched module name: %s", name) cur_accelerator.synchronize() - # logger.info("Patched module name: %s", name) - # show_mem_info() if save_file: # cache calculated scales save_scales(model, scales_obj, scales_file_format, scale_file + ".npz") save_scales(model, scales_obj, scales_file_format, scale_file + ".json") @@ -184,20 +182,6 @@ def prepare_model(model, mod_list, measurement, scale_file, scaling_method_name, logger.debug("Patched modules: %s", patched_modules) logger.debug("Total patched modules: %d", len(patched_modules)) - - def inspect_tensor(tensor, msg=""): - - if "cpu" in str(tensor.device): - logger.info(f"{msg}: tensor dtype: {tensor.dtype}, tensor shape: {tensor.shape}, deveice: {tensor.device}") - # if torch.distributed.get_rank() 
== 0: - # import pdb; pdb.set_trace() - # torch.distributed.barrier() - - for pname, param in model.named_parameters(): - inspect_tensor(param, pname) - # check buffer - for bname, buffer in model.named_buffers(): - inspect_tensor(buffer, bname) show_mem_info("before move all") model = model.to(cur_accelerator.name()) show_mem_info("after move all") From f72e7828827c94e286062cb3a673c6d1d1167bfe Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Mon, 31 Mar 2025 16:39:44 +0300 Subject: [PATCH 16/28] force install pt Change-Id: I8195c4ac812396319843c14b1af9721857516fca Signed-off-by: Yi Liu --- setup.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index d0be2291d6f..c628b1037c2 100644 --- a/setup.py +++ b/setup.py @@ -102,7 +102,9 @@ def get_build_version(): # https://github.com/pytorch/pytorch/pull/114662 ext_modules = [] cmdclass = {} - + + + if "pt" in sys.argv: sys.argv.remove("pt") cfg_key = "neural_compressor_pt" @@ -110,7 +112,9 @@ def get_build_version(): if "tf" in sys.argv: sys.argv.remove("tf") cfg_key = "neural_compressor_tf" - + # FIXME: (Yi) force install neural_compressor_pt + print(f"Forcing install neural_compressor_pt") + cfg_key = "neural_compressor_pt" project_name = PKG_INSTALL_CFG[cfg_key].get("project_name") include_packages = PKG_INSTALL_CFG[cfg_key].get("include_packages") or {} package_data = PKG_INSTALL_CFG[cfg_key].get("package_data") or {} From c1f938a05d1d78c83a27bc3775e302d2d4b5cec7 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Mon, 31 Mar 2025 16:49:16 +0300 Subject: [PATCH 17/28] install all reqs Change-Id: Ibf48a8bc7c9eb9947345fdf617ccf483f70fb2b8 Signed-off-by: Yi Liu --- setup.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index c628b1037c2..5d2fe1f9f4a 100644 --- a/setup.py +++ b/setup.py @@ -75,7 +75,12 @@ def get_build_version(): ], ), "package_data": {"": ["*.json"]}, - "install_requires": fetch_requirements("requirements_pt.txt"), + # FIXME: (Yi) force install neural_compressor_pt + # "install_requires": fetch_requirements("requirements_pt.txt"), + "install_requires": fetch_requirements("requirements.txt"), + "extras_require": { + "pt": fetch_requirements("requirements_pt.txt"), + } }, # 3.x tf binary build config, pip install neural-compressor-tf, install 3.x TensorFlow API. 
"neural_compressor_tf": { From c748f5e7f143ff0f50ac9ea05a54364eb73866f2 Mon Sep 17 00:00:00 2001 From: Yi Date: Tue, 1 Apr 2025 06:33:13 +0300 Subject: [PATCH 18/28] update dequant func --- .../torch/algorithms/fp8_quant/_core/quantize.py | 1 + .../fp8_quant/_core/scale_methods/ops_quantizer.py | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py index c16f59a74f2..b73a69ed466 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py @@ -83,6 +83,7 @@ def quantize_params(mod, mod_extra_config): quantized_param = quantizer(param.to(cur_accelerator.name())) delattr(mod, param_name) setattr(mod, param_name, nn.Parameter(quantized_param)) + mod._updated_weight = True quantized_param = getattr(mod, param_name) quantized_param.requires_grad_(False) cur_accelerator.synchronize() diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/ops_quantizer.py b/neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/ops_quantizer.py index 738c12e0ff7..fe23a92a6c3 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/ops_quantizer.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/ops_quantizer.py @@ -20,7 +20,8 @@ from .scales_method import QuantTensorType from ..quant_dequant import DequantOutput, QuantDequant, QuantDequantNone, QuantInput from neural_compressor.common import utils as inc_utils - +# from neural_compressor.torch.algorithms.fp8_quant.utils import +from neural_compressor.torch.algorithms.fp8_quant._core.common import maybe_dequant_original_fp8_weight class BaseOpQuantizer: def __init__(self, config, mod, measurement, params, op_type): @@ -96,7 +97,8 @@ def get_scales_module_config(self): rescaled_weight = self.mod.weight if hasattr(self.mod, 'weight') else None if self.weight_ich_scale_calc is not None: weight_scales_in_ch = self.weight_ich_scale_calc.calc_scales(input_scales[0], QuantTensorType.CONST) - rescaled_weight = torch.div(self.mod.weight, weight_scales_in_ch.reshape([1, -1])) + bf16_weight = maybe_dequant_original_fp8_weight(self.mod, self.mod.weight) + rescaled_weight = torch.div(bf16_weight, weight_scales_in_ch.reshape([1, -1])) weights_scales_out_ch = self.weight_och_scale_calc.calc_scales(rescaled_weight, QuantTensorType.CONST) params_config = ( {"weight": weights_scales_out_ch} From eb5f04d93d46991b89a1af82296d960b70f44d73 Mon Sep 17 00:00:00 2001 From: Yi Date: Tue, 1 Apr 2025 18:27:03 +0800 Subject: [PATCH 19/28] update --- neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py index b73a69ed466..201172475c5 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py @@ -83,7 +83,8 @@ def quantize_params(mod, mod_extra_config): quantized_param = quantizer(param.to(cur_accelerator.name())) delattr(mod, param_name) setattr(mod, param_name, nn.Parameter(quantized_param)) - mod._updated_weight = True + # Note: in case of re-quantize the fp8 weights, we need to set `updated_fp8_weight` to True + mod.updated_fp8_weight = True quantized_param = getattr(mod, param_name) 
quantized_param.requires_grad_(False) cur_accelerator.synchronize() From fcf303148211225b6af435413cffa14457b365c6 Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Sat, 5 Apr 2025 15:55:55 +0800 Subject: [PATCH 20/28] Update PatchedVLLMKVCache for deepseek performance (#2165) --- .../fp8_quant/_quant_common/helper_modules.py | 32 +++++++------------ 1 file changed, 11 insertions(+), 21 deletions(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py index a2cd3ac258f..5652327fd6e 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py @@ -1082,10 +1082,10 @@ def forward_qdq(self, input, *args, **kwargs): output_cache = self.orig_mod(qinput, *args, **kwargs) return output_cache - # def forward_quant(self, input, *args, **kwargs): - # qinput = self.quant_input(input) - # output_cache = self.orig_mod(qinput, *args, **kwargs) - # return self.dequant_output(output_cache) + def forward_quant(self, input, *args, **kwargs): + qinput = self.quant_input(input) + output_cache = self.orig_mod(qinput, *args, **kwargs) + return self.dequant_output(output_cache) def forward_measure(self, input, *args, **kwargs): measure_input((input, ), self._mod_extra_config.inputs) @@ -1093,22 +1093,12 @@ def forward_measure(self, input, *args, **kwargs): measure_output((output_cache, ), self._mod_extra_config.outputs) return output_cache - # def fetch_from_cache(self, cache, blocks, permutations=None): - # # quant_cache = self.quant_input(cache) - # quant_cache = cache - # if permutations: - # output_cache = self.orig_mod.fetch_from_cache(quant_cache, blocks, permutations) - # for i in range(len(output_cache)): - # output_cache[i] = self.dequant_output(output_cache[i]) - # return output_cache - # output_cache = self.orig_mod.fetch_from_cache(quant_cache, blocks) - # return self.dequant_output(output_cache) - - def forward_quant(self, input, *args, **kwargs): - qinput = self.quant_input(input) - return self.orig_mod(qinput, *args, **kwargs) - - def fetch_from_cache(self, quant_cache, blocks, permutations=None): + def fetch_from_cache(self, cache, blocks, permutations=None): + # TODO: Remove this workaround in next release [SW-221595] + if cache.dtype != self.lp_dtype: + quant_cache = self.quant_input(cache) + else: + quant_cache = cache if permutations: output_cache = self.orig_mod.fetch_from_cache(quant_cache, blocks, permutations) for i in range(len(output_cache)): @@ -1116,7 +1106,7 @@ def fetch_from_cache(self, quant_cache, blocks, permutations=None): return output_cache output_cache = self.orig_mod.fetch_from_cache(quant_cache, blocks) return self.dequant_output(output_cache) - + def extra_repr(self) -> str: return f"PatchedVLLMKVCache" From 39d1ccfbbf19797c0262432236fd311eb5752589 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Mon, 7 Apr 2025 04:15:17 +0300 Subject: [PATCH 21/28] align fused moe op Change-Id: I6a19a4d7492a221785901c464b8d8c8ccaedd898 Signed-off-by: Yi Liu --- .../algorithms/fp8_quant/_quant_common/helper_modules.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py index 5652327fd6e..73aed3eb05c 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py +++ 
b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py @@ -865,14 +865,14 @@ def forward_measure( x, topk_ids, topk_weights, - moe_n_slice, - n_expert_slice, - ep_shift, + moe_n_slice=None, + n_expert_slice=None, + ep_shift=None, ): hidden_states = x measure_input((hidden_states,), observer=self._mod_extra_config.inputs) # FIXME: (Yi) Assume moe_n_slice is 1, remove it? - assert moe_n_slice == 1, f"moe_n_slice is {moe_n_slice}, expected 1" + # assert moe_n_slice == 1, f"moe_n_slice is {moe_n_slice}, expected 1" min_expert = self.experts_min max_expert = self.experts_max w13_list_slice = [] From c3c60a9d602f7640430828502d3e2ef1e0d7f39f Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Mon, 7 Apr 2025 06:57:55 +0300 Subject: [PATCH 22/28] fix moe op Change-Id: I94138049a5609accae94aeec40c9b82382eb705d --- neural_compressor/torch/algorithms/fp8_quant/_core/measure.py | 3 +++ .../algorithms/fp8_quant/_quant_common/helper_modules.py | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/measure.py b/neural_compressor/torch/algorithms/fp8_quant/_core/measure.py index 36ea0616f94..1c1e6fefc7e 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/measure.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/measure.py @@ -149,6 +149,9 @@ def register_patched_measure_modules(model, mod_list, observer_class, d_shapes=N logger.info(f"Patching measure module {name} {mod.__class__}") num_info += 1 set_hqt_config(mod, top_level_config) # set config in the module, as it consumed by the patched module + # override default number of outputs for dynamic moe + mod_types[mod_type].num_outputs = mod.num_experts+1 + logger.warning(f"Dynamic moe num_outputs set to {mod.num_experts+1}") mod_extra_config = ( init_measure_object( mod, diff --git a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py index 73aed3eb05c..b80b6e8a3e5 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py @@ -907,8 +907,8 @@ def forward_quant( x, topk_ids, topk_weights, - moe_n_slice, - n_expert_slice, + moe_n_slice=None, + n_expert_slice=None, ep_shift=None, ): hidden_states = x From ccb90e5d7144c2111186710cdea289aa9f9b3fb9 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Mon, 7 Apr 2025 07:07:12 +0300 Subject: [PATCH 23/28] fix moe op num_outputs Change-Id: Ic3e689f026497908f409d9ffc2a69971fe36d0cc Signed-off-by: Yi Liu --- .../torch/algorithms/fp8_quant/_core/measure.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/measure.py b/neural_compressor/torch/algorithms/fp8_quant/_core/measure.py index 1c1e6fefc7e..ef9bdc1c7b4 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/measure.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/measure.py @@ -149,9 +149,10 @@ def register_patched_measure_modules(model, mod_list, observer_class, d_shapes=N logger.info(f"Patching measure module {name} {mod.__class__}") num_info += 1 set_hqt_config(mod, top_level_config) # set config in the module, as it consumed by the patched module - # override default number of outputs for dynamic moe - mod_types[mod_type].num_outputs = mod.num_experts+1 - logger.warning(f"Dynamic moe num_outputs set to {mod.num_experts+1}") + if mod_type == "dynamic_moe" 
and hasattr(mod, "num_experts"): + # override default number of outputs for dynamic moe + mod_types[mod_type].num_outputs = mod.num_experts+1 + logger.warning(f"Dynamic moe num_outputs set to {mod.num_experts+1}") mod_extra_config = ( init_measure_object( mod, From 56de6185b59fe6697df4a8715a4e565361cc8750 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 9 Apr 2025 16:20:29 +0800 Subject: [PATCH 24/28] dequant for pts as well (#2167) Change-Id: Ic1b11c25fa7402ab0b4e2c0cca0939a89d20ffa2 Signed-off-by: Yi Liu Co-authored-by: Yi Liu --- .../fp8_quant/_core/scale_methods/ops_quantizer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/ops_quantizer.py b/neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/ops_quantizer.py index fe23a92a6c3..697001ff818 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/ops_quantizer.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/ops_quantizer.py @@ -95,10 +95,11 @@ def get_scales_module_config(self): input_scales = self.calc_input_scales(num_of_inputs=1) output_measurement = self.measurement.outputs[0] if self.measurement is not None else [] rescaled_weight = self.mod.weight if hasattr(self.mod, 'weight') else None + if rescaled_weight is not None: + rescaled_weight = maybe_dequant_original_fp8_weight(self.mod, rescaled_weight) if self.weight_ich_scale_calc is not None: weight_scales_in_ch = self.weight_ich_scale_calc.calc_scales(input_scales[0], QuantTensorType.CONST) - bf16_weight = maybe_dequant_original_fp8_weight(self.mod, self.mod.weight) - rescaled_weight = torch.div(bf16_weight, weight_scales_in_ch.reshape([1, -1])) + rescaled_weight = torch.div(rescaled_weight, weight_scales_in_ch.reshape([1, -1])) weights_scales_out_ch = self.weight_och_scale_calc.calc_scales(rescaled_weight, QuantTensorType.CONST) params_config = ( {"weight": weights_scales_out_ch} From e340ed95de92f74b1d135c3b4fc5b4f313c06772 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Sat, 12 Apr 2025 20:29:46 +0800 Subject: [PATCH 25/28] correct linear measure (#2170) Signed-off-by: Yi --- .../fp8_quant/_quant_common/helper_modules.py | 42 ++++++++----------- 1 file changed, 17 insertions(+), 25 deletions(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py index b80b6e8a3e5..182f14311fc 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py @@ -416,22 +416,14 @@ def forward_quant(self, input): return self.post_all_reduce(dqoutput) def forward_measure(self, input): - # if torch.distributed.get_rank() == 0: - # import pdb; pdb.set_trace() - # torch.distributed.barrier() - # resolved_input = self.resolve_input(input) - # measure_input((resolved_input,), observer=self._mod_extra_config.inputs) + resolved_input = self.resolve_input(input) + measure_input((resolved_input,), observer=self._mod_extra_config.inputs) # output = torch.matmul(resolved_input, self.weight.transpose(-1, -2)) - # measure_output((output,), self._mod_extra_config.outputs) - # if self.reduce_results: - # output = self.collective_func(output) - # return self.post_all_reduce(output) - - # FIXME: It is not fully correct here - measure_input((input,), observer=self._mod_extra_config.inputs) - output, output_bias = self.orig_mod(input) + output = 
self.orig_mod.quant_method.apply(self.orig_mod, resolved_input) measure_output((output,), self._mod_extra_config.outputs) - return output, output_bias + if self.reduce_results: + output = self.collective_func(output) + return self.post_all_reduce(output) def post_all_reduce(self, output): assert ( @@ -482,21 +474,21 @@ def forward_quant(self, input): return self.post_all_reduce(dqoutput) def forward_measure(self, input): - # if torch.distributed.get_rank() == 0: - # import pdb; pdb.set_trace() measure_input((input,), observer=self._mod_extra_config.inputs) - # FIXME: it is not fully correct here, - output, output_bias = self.orig_mod(input) + output = self.orig_mod.quant_method.apply(self.orig_mod, input) measure_output((output,), self._mod_extra_config.outputs) + output, output_bias = self.add_bias(output) + if self.gather_output: + output = self.collective_func(output) return output, output_bias - # output = torch.matmul(input, self.weight.transpose(-1, -2)) - - # output = torch.matmul(input, self.weight.transpose(-1, -2)) - # measure_output((output,), self._mod_extra_config.outputs) - # if self.gather_output: - # output = self.collective_func(output) - # return self.post_all_reduce(output) + def add_bias(self, output): + if not self.skip_bias_add: + output = output + self.bias if self.bias is not None else output + output_bias = None + else: + output_bias = self.bias + return output, output_bias def post_all_reduce(self, output): if not self.skip_bias_add: From 887787eec19ae6b8c457ceb7de301195e884fac9 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 16 May 2025 08:28:39 +0800 Subject: [PATCH 26/28] add INC_MEASUREMENT_DUMP_PATH_PREFIX (#2210) Signed-off-by: yiliu30 --- .../fp8_quant/_quant_common/quant_config.py | 21 ++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/quant_config.py b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/quant_config.py index e4ebdefaa11..ad334f8bfbc 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/quant_config.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/quant_config.py @@ -24,7 +24,7 @@ import torch from neural_compressor.torch.utils.auto_accelerator import auto_detect_accelerator, INCAcceleratorType -from ..utils.logger import logger +from neural_compressor.torch.utils import logger try: world_size = torch.distributed.get_world_size() @@ -225,6 +225,25 @@ def parse(custom_config: Mapping[str, str]) -> Fp8cfg: else: measured_global_config[keys] = custom_config[keys] + INC_MEASUREMENT_DUMP_PATH_PREFIX = os.getenv("INC_MEASUREMENT_DUMP_PATH_PREFIX", None) + if INC_MEASUREMENT_DUMP_PATH_PREFIX is not None: + dump_stats_path = os.path.join(INC_MEASUREMENT_DUMP_PATH_PREFIX, measured_global_config["dump_stats_path"]) + measured_global_config["dump_stats_path"] = dump_stats_path + logger.info( + f"INC_MEASUREMENT_DUMP_PATH_PREFIX is set to {INC_MEASUREMENT_DUMP_PATH_PREFIX}, dump_stats_path is set to {dump_stats_path}" + ) + # check if the directory exists + + dir_path = os.path.dirname(measured_global_config["dump_stats_path"]) + if not os.path.exists(dir_path): + raise ValueError( + ( + f"The measurement dump directory '{dir_path}' does not exist," + f" the path is determined by the environment variable INC_MEASUREMENT_DUMP_PATH_PREFIX" + f" and the dump_stats_path in the quantization config file." 
+ ) + ) + # If seperate_measure_files is True (default value), then it is assumed that there are multiple distinct measure and scale files # and they are stored in / loaded from paths with the correct index as a suffix. Else, only one is searched for. measured_global_config["local_rank"] = ( From 971a7e55b017084ae3b4e4770ef4c678eb2b0f9a Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 16 May 2025 15:51:39 +0800 Subject: [PATCH 27/28] fix path check (#2212) Signed-off-by: yiliu30 --- .../torch/algorithms/fp8_quant/_quant_common/quant_config.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/quant_config.py b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/quant_config.py index ad334f8bfbc..88a703e7c6f 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/quant_config.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/quant_config.py @@ -233,9 +233,10 @@ def parse(custom_config: Mapping[str, str]) -> Fp8cfg: f"INC_MEASUREMENT_DUMP_PATH_PREFIX is set to {INC_MEASUREMENT_DUMP_PATH_PREFIX}, dump_stats_path is set to {dump_stats_path}" ) # check if the directory exists - + dir_path = os.path.dirname(measured_global_config["dump_stats_path"]) - if not os.path.exists(dir_path): + abs_path = os.path.abspath(dir_path) + if not (os.path.exists(dir_path) or os.path.exists(abs_path)): raise ValueError( ( f"The measurement dump directory '{dir_path}' does not exist," From 0bd4390d63153a59914fcf1d5735f9b3d6aae11e Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 16 May 2025 16:26:57 +0800 Subject: [PATCH 28/28] Naive Scaling (#2211) * add INC_MEASUREMENT_DUMP_PATH_PREFIX Signed-off-by: yiliu30 * add naive scaling Signed-off-by: yi * Update neural_compressor/torch/utils/environ.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --------- Signed-off-by: yiliu30 Signed-off-by: yi Co-authored-by: yi Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../torch/algorithms/fp8_quant/_core/fp_utils.py | 6 ++++++ .../fp8_quant/_core/scale_methods/scale_method_factory.py | 6 +++++- neural_compressor/torch/utils/environ.py | 4 ++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/fp_utils.py b/neural_compressor/torch/algorithms/fp8_quant/_core/fp_utils.py index 0ea9492e6f3..5f229ea8a60 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/fp_utils.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/fp_utils.py @@ -17,6 +17,9 @@ from neural_compressor.torch.utils.auto_accelerator import auto_detect_accelerator, INCAcceleratorType cur_accelerator = auto_detect_accelerator() +from neural_compressor.torch.utils import environ +from neural_compressor.common.utils import logger + descale_fcn = lambda x, scale: torch.mul(x, scale) scale_fcn = lambda x, scale: torch.div(x, scale) cast_fcn = lambda x, dtype: x.to(dtype=dtype) @@ -106,6 +109,9 @@ def get_fp8_hw_alligned_scales(dtype, device): } def calc_maxabs_scale(xmaxabs, fullscale, backoff=1): + if environ.INC_FORCE_NAIVE_SCALING: + backoff = 1.0 + logger.warning_once(f"Enabled naive scaling, backoff is set to {backoff}") scale = xmaxabs / (fullscale * backoff) return scale diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/scale_method_factory.py b/neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/scale_method_factory.py index b1517c712b1..91d97fb6fda 100644 --- 
a/neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/scale_method_factory.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/scale_method_factory.py @@ -16,7 +16,8 @@ from .round_scales_function import * from ..common import get_device_type_for_scales from .scales_method import * - +from neural_compressor.torch.utils import environ +from neural_compressor.common.utils import logger class QuantTensorName(Enum): INPUT = auto() @@ -40,6 +41,9 @@ class ScaleValueType(Enum): def parse_rounding_method(config, device_for_scales): round_method = ScaleIdentity() + if environ.INC_FORCE_NAIVE_SCALING: + logger.warning_once("Enabled naive scaling") + return round_method if "single" in config and "hw" in config: round_method = ScaleHwAlignedFixed(device_for_scales) elif "unit" in config: diff --git a/neural_compressor/torch/utils/environ.py b/neural_compressor/torch/utils/environ.py index 623d3446232..7c40af92710 100644 --- a/neural_compressor/torch/utils/environ.py +++ b/neural_compressor/torch/utils/environ.py @@ -32,6 +32,10 @@ world_size = int(os.getenv("WORLD_SIZE", "-1")) + +INC_FORCE_NAIVE_SCALING = os.getenv("INC_FORCE_NAIVE_SCALING", "0").lower() in ["1", "true"] + + ################ Check imported sys.module first to decide behavior ################# def is_ipex_imported() -> bool: """Check whether intel_extension_for_pytorch is imported."""
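
Taken together, patches 26 and 27 make the measurement dump location overridable through the INC_MEASUREMENT_DUMP_PATH_PREFIX environment variable: when it is set, the prefix is joined with dump_stats_path from the quantization config, and the resulting parent directory must already exist (checked both as given and as an absolute path). Below is a minimal standalone sketch of that resolution logic; the function name and example values are illustrative and not part of the patched module.

import os

def resolve_dump_stats_path(dump_stats_path: str) -> str:
    """Illustrative sketch of the dump-path handling from patches 26/27."""
    prefix = os.getenv("INC_MEASUREMENT_DUMP_PATH_PREFIX", None)
    if prefix is not None:
        dump_stats_path = os.path.join(prefix, dump_stats_path)
        # The target directory must already exist; both the given and the
        # absolute form are accepted so relative prefixes keep working.
        dir_path = os.path.dirname(dump_stats_path)
        abs_path = os.path.abspath(dir_path)
        if not (os.path.exists(dir_path) or os.path.exists(abs_path)):
            raise ValueError(
                f"The measurement dump directory '{dir_path}' does not exist;"
                " it is derived from INC_MEASUREMENT_DUMP_PATH_PREFIX and the"
                " dump_stats_path in the quantization config file."
            )
    return dump_stats_path

For example, with INC_MEASUREMENT_DUMP_PATH_PREFIX=/tmp/inc_measure (a hypothetical path) and dump_stats_path "llama/measure", the stats land under /tmp/inc_measure/llama/measure, provided /tmp/inc_measure/llama already exists.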
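
Patch 28 adds a second switch, INC_FORCE_NAIVE_SCALING, parsed once in neural_compressor/torch/utils/environ.py and consulted in fp_utils.py and scale_method_factory.py: when the flag is true, the maxabs backoff is forced to 1.0 and scale rounding falls back to ScaleIdentity, so scales are used exactly as computed from the measured maxabs. A condensed sketch of that effect, with plain floats standing in for the tensors used by the real helper:

import os

# Parsed once at import time, mirroring the convention added to environ.py.
INC_FORCE_NAIVE_SCALING = os.getenv("INC_FORCE_NAIVE_SCALING", "0").lower() in ["1", "true"]

def calc_maxabs_scale(xmaxabs: float, fullscale: float, backoff: float = 1.0) -> float:
    """Sketch of the maxabs scale computation with the naive-scaling override."""
    if INC_FORCE_NAIVE_SCALING:
        backoff = 1.0  # drop the configured backoff margin entirely
    return xmaxabs / (fullscale * backoff)

With xmaxabs=120, fullscale=240 and backoff=0.5 (all illustrative numbers), the default path yields a scale of 1.0 while naive scaling yields 0.5; hardware-aligned rounding of the result is likewise skipped, since parse_rounding_method returns ScaleIdentity early when the flag is set.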