From 730bf0cb3c12dc43df89169f96644e4ca4735288 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Thu, 20 Mar 2025 10:28:54 +0200 Subject: [PATCH 01/28] fix patched mod wip Change-Id: Ibb92d77e312394736cb548f60f7039fd9944adce Signed-off-by: Yi Liu --- .../algorithms/fp8_quant/_core/common.py | 14 ++-- .../algorithms/fp8_quant/_core/measure.py | 2 + .../fp8_quant/_quant_common/helper_modules.py | 67 ++++++++++++------- 3 files changed, 52 insertions(+), 31 deletions(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/common.py b/neural_compressor/torch/algorithms/fp8_quant/_core/common.py index 11992ec8d63..775fb690061 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/common.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/common.py @@ -224,13 +224,13 @@ def convert_scales_to_tensors_dict(scales_obj, scales_file_format, hp_dtype, dev "MoeMatmul": ModuleInfo("linear", PatchedMoeMatmul), "ReplicatedLinear": ModuleInfo("linear", PatchedReplicatedLinear), # FIXME (Yi) revert change - "FusedMoE": ModuleInfo("linear", PatchedMixtralMoE, False), - "GaudiMixtralSparseMoeBlock": ModuleInfo("dynamic_moe", PatchedGaudiMixtralSparseMoeBlock), - "VllmMixtureOfExpertsOp": ( - ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpV2) - if os.getenv("LOW_CPU_MEM", "0") == "1" - else ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpV1) - ), + # "FusedMoE": ModuleInfo("linear", PatchedMixtralMoE, False), + # "GaudiMixtralSparseMoeBlock": ModuleInfo("dynamic_moe", PatchedGaudiMixtralSparseMoeBlock), + # "VllmMixtureOfExpertsOp": ( + # ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpV2) + # if os.getenv("LOW_CPU_MEM", "0") == "1" + # else ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpV1) + # ), } diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/measure.py b/neural_compressor/torch/algorithms/fp8_quant/_core/measure.py index 9b1db9d0906..ee620046946 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/measure.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/measure.py @@ -167,6 +167,8 @@ def register_patched_measure_modules(model, mod_list, observer_class, d_shapes=N # logger.info(f"Pacthed module pmod: {pmod}") if pmod._mod_extra_config: for param_name in pmod._mod_extra_config.params: + # if torch.distributed.get_rank() == 0: + # import pdb; pdb.set_trace() param = getattr(pmod, param_name) if config["measure_on_hpu"]: param = param.to(cur_accelerator.name()) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py index 245b3f08a16..b7b0bf4125c 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py @@ -416,13 +416,22 @@ def forward_quant(self, input): return self.post_all_reduce(dqoutput) def forward_measure(self, input): - resolved_input = self.resolve_input(input) - measure_input((resolved_input,), observer=self._mod_extra_config.inputs) - output = torch.matmul(resolved_input, self.weight.transpose(-1, -2)) + # if torch.distributed.get_rank() == 0: + # import pdb; pdb.set_trace() + # torch.distributed.barrier() + # resolved_input = self.resolve_input(input) + # measure_input((resolved_input,), observer=self._mod_extra_config.inputs) + # output = torch.matmul(resolved_input, self.weight.transpose(-1, -2)) + # measure_output((output,), self._mod_extra_config.outputs) + # if 
self.reduce_results: + # output = self.collective_func(output) + # return self.post_all_reduce(output) + + # FIXME: It is not fully correct here + measure_input((input,), observer=self._mod_extra_config.inputs) + output, output_bias = self.orig_mod(input) measure_output((output,), self._mod_extra_config.outputs) - if self.reduce_results: - output = self.collective_func(output) - return self.post_all_reduce(output) + return output, output_bias def post_all_reduce(self, output): assert ( @@ -473,12 +482,21 @@ def forward_quant(self, input): return self.post_all_reduce(dqoutput) def forward_measure(self, input): + # if torch.distributed.get_rank() == 0: + # import pdb; pdb.set_trace() measure_input((input,), observer=self._mod_extra_config.inputs) - output = torch.matmul(input, self.weight.transpose(-1, -2)) + # FIXME: it is not fully correct here, + output, output_bias = self.orig_mod(input) measure_output((output,), self._mod_extra_config.outputs) - if self.gather_output: - output = self.collective_func(output) - return self.post_all_reduce(output) + return output, output_bias + + # output = torch.matmul(input, self.weight.transpose(-1, -2)) + + # output = torch.matmul(input, self.weight.transpose(-1, -2)) + # measure_output((output,), self._mod_extra_config.outputs) + # if self.gather_output: + # output = self.collective_func(output) + # return self.post_all_reduce(output) def post_all_reduce(self, output): if not self.skip_bias_add: @@ -576,18 +594,19 @@ def __init__(self, mod, parent, mod_extra_config, *args, **kwargs): super().__init__(mod, parent, mod_extra_config, *args, **kwargs) # remove the MoE weights that are quanted by PatchedMoeMatmul if self.quantization_mode in [QuantMode.QUANTIZE, QuantMode.LOAD]: - if hasattr(mod, "w13_weight"): - delattr(mod, "w13_weight") - setattr(mod, "w13_weight", None) - if hasattr(mod, "w2_weight"): - delattr(mod, "w2_weight") - setattr(self, "w2_weight", None) - if hasattr(mod, "w1_weight"): - delattr(mod, "w1_weight") - setattr(self, "w1_weight", None) - if hasattr(mod, "w3_weight"): - delattr(mod, "w3_weight") - setattr(self, "w3_weight", None) + pass + # if hasattr(mod, "w13_weight"): + # delattr(mod, "w13_weight") + # setattr(mod, "w13_weight", None) + # if hasattr(mod, "w2_weight"): + # delattr(mod, "w2_weight") + # setattr(self, "w2_weight", None) + # if hasattr(mod, "w1_weight"): + # delattr(mod, "w1_weight") + # setattr(self, "w1_weight", None) + # if hasattr(mod, "w3_weight"): + # delattr(mod, "w3_weight") + # setattr(self, "w3_weight", None) self.forward = self.forward_orig @@ -724,8 +743,8 @@ def extra_repr(self) -> str: class PatchedVllmMixtureOfExpertsOpV1(PatchedModuleBase): def __init__(self, mod, parent, mod_extra_config, *args, **kwargs): super().__init__(mod, parent, mod_extra_config, *args, **kwargs) - self.experts_min = self.orig_mod.experts_min - self.experts_max = self.orig_mod.experts_max + self.experts_min = self.orig_mod.experts_min if hasattr(self.orig_mod, "experts_min") else 0 + self.experts_max = self.orig_mod.experts_max if hasattr(self.orig_mod, "experts_max") else 7 if self.quantization_mode in [QuantMode.QUANTIZE, QuantMode.LOAD]: self.forward = self.forward_quant self.dynamic_moe_op = get_quantized_func_wrapper(OP_TYPE.DYNAMIC_MOE_FUSED_WEIGHTS, self.scale_format) From fbd59c2d9079f2791c11936c8dfeec6b510779b1 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Thu, 20 Mar 2025 12:40:32 +0200 Subject: [PATCH 02/28] dequant fp8 if needed Change-Id: If65d0253fa8bbc926b5a33b75c2946fc51e49272 Signed-off-by: Yi Liu --- 
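
This change is meant to let measurement and re-quantization work on checkpoints
whose weights are already stored in FP8 (for example block-quantized DeepSeek
weights). Before a parameter is observed or quantized, the new
maybe_dequant_original_fp8_weight() helper asks the owning module for a
dequantization hook and, when one is provided, operates on the dequantized BF16
view instead of the raw FP8 tensor. Below is a minimal sketch of the calling
convention the helper assumes; MyFP8Linear, its weight_scale_inv buffer and the
_dequant closure are hypothetical illustrations, not code introduced by this
patch.

import torch

class MyFP8Linear(torch.nn.Module):
    """Hypothetical module that keeps its weight in FP8 plus an inverse scale."""

    def __init__(self, weight_fp8: torch.Tensor, weight_scale_inv: torch.Tensor):
        super().__init__()
        # FP8 parameters must not require grad.
        self.weight = torch.nn.Parameter(weight_fp8, requires_grad=False)
        self.register_buffer("weight_scale_inv", weight_scale_inv)

    def get_post_process_weights_func(self):
        # Hook consulted by maybe_dequant_original_fp8_weight(): return a callable
        # that rebuilds the high-precision weight from the stored FP8 data.
        def _dequant(mod):
            return mod.weight.to(torch.bfloat16) * mod.weight_scale_inv.to(torch.bfloat16)
        return _dequant

With a hook like this in place, the observer in measure.py and the quantizer in
quantize.py receive the BF16 tensor produced by _dequant rather than the FP8
parameter itself.
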
.../torch/algorithms/fp8_quant/_core/common.py | 8 ++++++++ .../torch/algorithms/fp8_quant/_core/measure.py | 3 ++- .../torch/algorithms/fp8_quant/_core/quantize.py | 3 ++- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/common.py b/neural_compressor/torch/algorithms/fp8_quant/_core/common.py index 775fb690061..49a067b9305 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/common.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/common.py @@ -42,6 +42,14 @@ INFO_INTERVAL = 30 # seconds +def maybe_dequant_original_fp8_weight(mod: torch.nn.Module, param: torch.Tensor): + if param.dtype in [torch.float8_e4m3fn]: + if hasattr(mod, "get_post_process_weights_func"): + post_process_weights_func = mod.get_post_process_weights_func() + if post_process_weights_func is not None: + param = post_process_weights_func(mod) + return param + _mod_types = { "linear": ModuleType(1, ["weight"], 1, False), "matmul": ModuleType(2, [], 1, False), diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/measure.py b/neural_compressor/torch/algorithms/fp8_quant/_core/measure.py index ee620046946..36ea0616f94 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/measure.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/measure.py @@ -22,7 +22,7 @@ import time from .._quant_common.quant_config import MeasureExclude, QuantMode, ScaleMethod, get_hqt_config, set_hqt_config # from ..utils.logger import logger -from neural_compressor.torch.algorithms.fp8_quant._core.common import INFO_INTERVAL +from neural_compressor.torch.algorithms.fp8_quant._core.common import INFO_INTERVAL, maybe_dequant_original_fp8_weight from .common import * from neural_compressor.torch.utils.auto_accelerator import auto_detect_accelerator from neural_compressor.torch.algorithms.fp8_quant.model_configs import ( @@ -170,6 +170,7 @@ def register_patched_measure_modules(model, mod_list, observer_class, d_shapes=N # if torch.distributed.get_rank() == 0: # import pdb; pdb.set_trace() param = getattr(pmod, param_name) + param = maybe_dequant_original_fp8_weight(pmod.orig_mod, param) if config["measure_on_hpu"]: param = param.to(cur_accelerator.name()) pmod._mod_extra_config.params[param_name].measure(param) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py index 83ae9f7ea27..31e41647955 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py @@ -33,7 +33,7 @@ import time cur_accelerator = auto_detect_accelerator() -from neural_compressor.torch.algorithms.fp8_quant._core.common import INFO_INTERVAL +from neural_compressor.torch.algorithms.fp8_quant._core.common import INFO_INTERVAL, maybe_dequant_original_fp8_weight @torch.no_grad() @@ -78,6 +78,7 @@ def quantize_params(mod, mod_extra_config): param = getattr(mod, param_name) if param.dtype == torch.float16: param = param.to(torch.bfloat16) + param = maybe_dequant_original_fp8_weight(mod, param) quantized_param = quantizer(param.to(cur_accelerator.name())) delattr(mod, param_name) setattr(mod, param_name, nn.Parameter(quantized_param)) From 8328996a322556586d92c553ea07971435c07bff Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 09:22:06 +0200 Subject: [PATCH 03/28] add patched mod for fp8 ds Change-Id: I3e6893be05ba36eb67e367a775fde5ee93d3fa7e Signed-off-by: Yi Liu --- 
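
This patch registers the FP8-DeepSeek module types (MoeFP8Matmul and the
runtime-dequant dynamic-MoE op) so the fused-MoE path can be measured with
on-the-fly dequantized expert weights and then re-quantized with INC scales.
The two-pass flow these patched modules plug into is sketched below; it assumes
the standard prepare/finalize_calibration/convert entry points of
neural_compressor.torch.quantization, and the JSON config file names are
placeholders only.

import torch
from neural_compressor.torch.quantization import (
    FP8Config,
    convert,
    finalize_calibration,
    prepare,
)

def measure_pass(model, calib_dataloader, measure_json="maxabs_measure.json"):
    # Pass 1 (typically its own run): patched modules observe amax per tensor
    # and, for the dynamic-MoE op, per expert.
    model = prepare(model, FP8Config.from_json_file(measure_json))
    with torch.no_grad():
        for batch in calib_dataloader:
            model(batch)
    finalize_calibration(model)  # dump the measurement files to disk
    return model

def quantize_pass(model, quant_json="maxabs_quant.json"):
    # Pass 2 (typically a fresh run on the original model): derive scales from
    # the measurement dump and swap in the FP8 kernels.
    return convert(model, FP8Config.from_json_file(quant_json))

During pass 1, MoeFP8Matmul exposes get_dequant_weight() so the fused-MoE
measurement kernel runs on BF16 expert weights; during pass 2 those weights are
re-quantized with the scales computed by INC.
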
neural_compressor/common/utils/__init__.py | 2 +- .../algorithms/fp8_quant/_core/common.py | 2 + .../algorithms/fp8_quant/_core/quantize.py | 9 +- .../fp8_quant/_quant_common/helper_modules.py | 116 +++++++++++++++++- 4 files changed, 121 insertions(+), 8 deletions(-) diff --git a/neural_compressor/common/utils/__init__.py b/neural_compressor/common/utils/__init__.py index 5b4a8043ff1..93133e30571 100644 --- a/neural_compressor/common/utils/__init__.py +++ b/neural_compressor/common/utils/__init__.py @@ -28,7 +28,7 @@ VLLM_TP_SIZE = int(os.getenv("VLLM_TP_SIZE", "8")) VLLM_EP_SIZE = int(os.getenv("VLLM_EP_SIZE", VLLM_TP_SIZE)) NUM_EXPERTS_PER_EP_RANK = DEEPSEEK_EXPERTS // VLLM_EP_SIZE # 32 -NUM_EXPERTS_GROUPS = 8 +NUM_EXPERTS_GROUPS = int(os.getenv("NUM_EXPERTS_GROUPS", 8)) NUM_EXPERTS_PER_GROUP_PER_RANK = NUM_EXPERTS_PER_EP_RANK // NUM_EXPERTS_GROUPS # 4 FUSED_MOE_EXPERTS = NUM_EXPERTS_PER_GROUP_PER_RANK # 4 diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/common.py b/neural_compressor/torch/algorithms/fp8_quant/_core/common.py index 49a067b9305..d127e5c6a3f 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/common.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/common.py @@ -230,7 +230,9 @@ def convert_scales_to_tensors_dict(scales_obj, scales_file_format, hp_dtype, dev "Softmax": ModuleInfo("softmax", PatchedSoftmax), "ModuleFusedSDPA": ModuleInfo("fused_sdpa", PatchedModuleFusedSDPA), "MoeMatmul": ModuleInfo("linear", PatchedMoeMatmul), + "MoeFP8Matmul": ModuleInfo("linear", PatchedMoeFP8Matmul), "ReplicatedLinear": ModuleInfo("linear", PatchedReplicatedLinear), + "DynamicMoeRuntimeDequantFP8": ModuleInfo("dynamic_moe", PatchedDynamicMoeRuntimeDequantFP8), # FIXME (Yi) revert change # "FusedMoE": ModuleInfo("linear", PatchedMixtralMoE, False), # "GaudiMixtralSparseMoeBlock": ModuleInfo("dynamic_moe", PatchedGaudiMixtralSparseMoeBlock), diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py index 31e41647955..f6b4ff87cf8 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py @@ -180,14 +180,17 @@ def prepare_model(model, mod_list, measurement, scale_file, scaling_method_name, logger.debug("Patched modules: %s", patched_modules) logger.debug("Total patched modules: %d", len(patched_modules)) model = model.to(cur_accelerator.name()) - for _, mod in model.named_modules(): - if hasattr(mod, "post_process"): - mod.post_process() + postporcess_after_convert_(model) torch.distributed.barrier() convert_fp16_to_bf16(model) cur_accelerator.synchronize() +def postporcess_after_convert_(model): + for _, mod in model.named_modules(): + if hasattr(mod, "post_process"): + mod.post_process() + def prepare_model_with_dummy_measurement(model, mod_list, scaling_method_name, scale_config): """Aim for loading, replace module with patched module for model on meta device. 
diff --git a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py index b7b0bf4125c..31eea4b9994 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py @@ -651,7 +651,16 @@ def extra_repr(self) -> str: get_current_repr(self, "scale_input", "scale_weight"), ) - +class PatchedMoeFP8Matmul(PatchedMoeMatmul): + def __init__(self, mod, parent, mod_extra_config, *args, **kwargs): + super().__init__(mod, parent, mod_extra_config, *args, **kwargs) + # if torch.distributed.get_rank() == 0: + # import pdb; pdb.set_trace() + # torch.distributed.barrier() + # self.block_size = self.orig_mod.block_size + # self.scale_inv_fp8 = self.orig_mod.scale_inv_fp8 + self.get_dequant_weight = self.orig_mod.get_dequant_weight + class PatchedGaudiMixtralSparseMoeBlock(PatchedModuleBase): def __init__(self, mod, parent, mod_extra_config, *args, **kwargs): super().__init__(mod, parent, mod_extra_config, *args, **kwargs) @@ -756,11 +765,18 @@ def __init__(self, mod, parent, mod_extra_config, *args, **kwargs): [mod_extra_config.scale.inputs[x] for x in range(1, self.num_experts+1)], self.scale_format, ) - for i in range(self.num_experts): - self.w13_list[i].weight = self.w13_list[i].weight.squeeze().t().contiguous() - self.w2_list[i].weight = self.w2_list[i].weight.squeeze().t().contiguous() + # if torch.distributed.get_rank() == 0: + # import pdb; pdb.set_trace() + # torch.distributed.barrier() + self._post_init_for_quant() + elif (self.quantization_mode == QuantMode.MEASURE) or (self.quantization_mode == QuantMode.SHAPE): self.forward = self.forward_measure + + def _post_init_for_quant(self): + for i in range(self.num_experts): + self.w13_list[i].weight = self.w13_list[i].weight.squeeze().t().contiguous() + self.w2_list[i].weight = self.w2_list[i].weight.squeeze().t().contiguous() def forward_quant(self, hidden_states, @@ -832,6 +848,98 @@ def extra_repr(self) -> str: f"quant_mode:{quant_mode}, {get_current_repr(self, *member_names)}", ) +class PatchedDynamicMoeRuntimeDequantFP8(PatchedVllmMixtureOfExpertsOpV1): + def _post_init_for_quant(self): + pass + + def post_process(self): + # if torch.distributed.get_rank() == 0: + # import pdb; pdb.set_trace() + # torch.distributed.barrier() + for i in range(self.num_experts): + self.w13_list[i].weight = torch.nn.Parameter(self.w13_list[i].weight.squeeze().t().contiguous()) + self.w2_list[i].weight = torch.nn.Parameter(self.w2_list[i].weight.squeeze().t().contiguous()) + + def forward_measure( + self, + x, + topk_ids, + topk_weights, + moe_n_slice, + n_expert_slice, + ep_shift, + ): + hidden_states = x + measure_input((hidden_states,), observer=self._mod_extra_config.inputs) + # Assume moe_n_slice is 1 + assert moe_n_slice == 1, f"moe_n_slice is {moe_n_slice}, expected 1" + i = 0 + # for i in range(moe_n_slice): + min_expert = i * n_expert_slice + max_expert = (i + 1) * n_expert_slice + w13_list_slice = [] + w2_list_slice = [] + for j in range(min_expert, max_expert): + w13_list_slice.append(self.w13_list[j].get_dequant_weight()) + w2_list_slice.append(self.w2_list[j].get_dequant_weight()) + + output, intermidiate_amax = torch.ops.hpu.mixture_of_experts.fp8_measurement_fused_weights( + hidden_states=x, + expert_routing_table=topk_ids.to(torch.int64), + router_weights=topk_weights.to(x.dtype), + w12=w13_list_slice, + w3=w2_list_slice, + permuted_weights=True, 
+ activation="silu", + experts_min=min_expert + ep_shift, + experts_max=max_expert - 1 + ep_shift, + measurement_mode=True, # <============= + ) + output_measure_list = [output] + # if torch.distributed.get_rank() == 0: + # import pdb; pdb.set_trace() + # torch.distributed.barrier() + for i in range(self.num_experts): + output_measure_list.append(intermidiate_amax[i]) + measure_output(output_measure_list, self._mod_extra_config.outputs) + return output + + def forward_quant( + self, + x, + topk_ids, + topk_weights, + moe_n_slice, + n_expert_slice, + ep_shift=None, + ): + hidden_states = x + expert_routing_table = topk_ids.to(torch.int64) + router_weights = topk_weights.to(x.dtype) + permuted_weights = True + activation = "silu" + experts_range = range(self.num_experts) + w1_list = [self.w13_list[i].weight for i in experts_range] + w2_list = [self.w2_list[i].weight for i in experts_range] + scale_w1 = [self.w13_list[i].scale_weight for i in experts_range] + scale_w2 = [self.w2_list[i].scale_weight for i in experts_range] + qinput = self.quant_input(hidden_states) + output = self.dynamic_moe_op( + hidden_states=qinput, + expert_routing_table=expert_routing_table, + router_weights=router_weights, + w12=w1_list, + w3=w2_list, + d_scale_w12=scale_w1, + d_scale_w3=scale_w2, + d_scale_hidden_states=self.scale_input, + d_scale_intermediate_hidden_states=self.scale_intermediate, + permuted_weights=permuted_weights, + activation=activation, + experts_min=self.experts_min, + experts_max=self.experts_max, + ) + return output class PatchedVllmMixtureOfExpertsOpV2(PatchedVllmMixtureOfExpertsOpV1): def __init__(self, mod, parent, mod_extra_config, *args, **kwargs): From eb9e4ed089f70757b38c0d46fb03970b1a243cda Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 12:22:24 +0200 Subject: [PATCH 04/28] add fused moe back Change-Id: I5d3b6fa90b2d19fc928791bb9d17b188851f230e Signed-off-by: Yi Liu --- .../algorithms/fp8_quant/_core/common.py | 2 +- .../algorithms/fp8_quant/_core/quantize.py | 1 + .../fp8_quant/_quant_common/helper_modules.py | 25 +++++++++---------- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/common.py b/neural_compressor/torch/algorithms/fp8_quant/_core/common.py index d127e5c6a3f..fda18e9c2d6 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/common.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/common.py @@ -234,7 +234,7 @@ def convert_scales_to_tensors_dict(scales_obj, scales_file_format, hp_dtype, dev "ReplicatedLinear": ModuleInfo("linear", PatchedReplicatedLinear), "DynamicMoeRuntimeDequantFP8": ModuleInfo("dynamic_moe", PatchedDynamicMoeRuntimeDequantFP8), # FIXME (Yi) revert change - # "FusedMoE": ModuleInfo("linear", PatchedMixtralMoE, False), + "FusedMoE": ModuleInfo("linear", PatchedMixtralMoE, False), # "GaudiMixtralSparseMoeBlock": ModuleInfo("dynamic_moe", PatchedGaudiMixtralSparseMoeBlock), # "VllmMixtureOfExpertsOp": ( # ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpV2) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py index f6b4ff87cf8..912ec5f504a 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py @@ -78,6 +78,7 @@ def quantize_params(mod, mod_extra_config): param = getattr(mod, param_name) if param.dtype == torch.float16: param = param.to(torch.bfloat16) + 
logger.debug(f"Quantizing parameter {param_name} of module {mod.__class__.__name__}") param = maybe_dequant_original_fp8_weight(mod, param) quantized_param = quantizer(param.to(cur_accelerator.name())) delattr(mod, param_name) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py index 31eea4b9994..07dc0b68f5e 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py @@ -594,19 +594,18 @@ def __init__(self, mod, parent, mod_extra_config, *args, **kwargs): super().__init__(mod, parent, mod_extra_config, *args, **kwargs) # remove the MoE weights that are quanted by PatchedMoeMatmul if self.quantization_mode in [QuantMode.QUANTIZE, QuantMode.LOAD]: - pass - # if hasattr(mod, "w13_weight"): - # delattr(mod, "w13_weight") - # setattr(mod, "w13_weight", None) - # if hasattr(mod, "w2_weight"): - # delattr(mod, "w2_weight") - # setattr(self, "w2_weight", None) - # if hasattr(mod, "w1_weight"): - # delattr(mod, "w1_weight") - # setattr(self, "w1_weight", None) - # if hasattr(mod, "w3_weight"): - # delattr(mod, "w3_weight") - # setattr(self, "w3_weight", None) + if hasattr(mod, "w13_weight"): + delattr(mod, "w13_weight") + setattr(mod, "w13_weight", None) + if hasattr(mod, "w2_weight"): + delattr(mod, "w2_weight") + setattr(self, "w2_weight", None) + if hasattr(mod, "w1_weight"): + delattr(mod, "w1_weight") + setattr(self, "w1_weight", None) + if hasattr(mod, "w3_weight"): + delattr(mod, "w3_weight") + setattr(self, "w3_weight", None) self.forward = self.forward_orig From 204bd992e84f5d8bc4fdbcc078bdebe9fc6e5ff5 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 13:28:43 +0200 Subject: [PATCH 05/28] show mem after patch Change-Id: Idf7dbfecd6a87dd8c48fad8fc7d31a689ddb4fbd Signed-off-by: Yi Liu --- neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py index 912ec5f504a..e47f3501213 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py @@ -174,6 +174,9 @@ def prepare_model(model, mod_list, measurement, scale_file, scaling_method_name, patched_module_types.add(type(mod)) htcore.mark_step() logger.debug("Patched module name: %s", name) + cur_accelerator.synchronize() + logger.info("Patched module name: %s", name) + show_mem_info() if save_file: # cache calculated scales save_scales(model, scales_obj, scales_file_format, scale_file + ".npz") save_scales(model, scales_obj, scales_file_format, scale_file + ".json") From 18f51b22414e05f54b98ffb01b2eccb8aa4788f3 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 15:39:40 +0200 Subject: [PATCH 06/28] debug Change-Id: Ic6954975f8bb1a1507acb5e86807d8fdd21f39dc Signed-off-by: Yi Liu --- .../algorithms/fp8_quant/_core/quantize.py | 23 +++++++++++++++++++ .../fp8_quant/_quant_common/helper_modules.py | 3 +++ neural_compressor/torch/utils/environ.py | 11 +++++---- 3 files changed, 33 insertions(+), 4 deletions(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py index e47f3501213..9fc60f26102 100644 --- 
a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py @@ -183,11 +183,34 @@ def prepare_model(model, mod_list, measurement, scale_file, scaling_method_name, logger.debug("Patched module types: %s", patched_module_types) logger.debug("Patched modules: %s", patched_modules) logger.debug("Total patched modules: %d", len(patched_modules)) + + + def inspect_tensor(tensor, msg=""): + + if "cpu" in str(tensor.device): + logger.info(f"{msg}: tensor dtype: {tensor.dtype}, tensor shape: {tensor.shape}, deveice: {tensor.device}") + # if torch.distributed.get_rank() == 0: + # import pdb; pdb.set_trace() + # torch.distributed.barrier() + + for pname, param in model.named_parameters(): + inspect_tensor(param, pname) + # check buffer + for bname, buffer in model.named_buffers(): + inspect_tensor(buffer, bname) + show_mem_info("before move all") model = model.to(cur_accelerator.name()) + show_mem_info("after move all") postporcess_after_convert_(model) + show_mem_info("after post process") torch.distributed.barrier() convert_fp16_to_bf16(model) + show_mem_info("after convert_fp16_to_bf16") cur_accelerator.synchronize() + show_mem_info("after synchronize") + if torch.distributed.get_rank() == 0: + import pdb; pdb.set_trace() + torch.distributed.barrier() def postporcess_after_convert_(model): diff --git a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py index 07dc0b68f5e..a0453253f95 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py @@ -917,6 +917,9 @@ def forward_quant( router_weights = topk_weights.to(x.dtype) permuted_weights = True activation = "silu" + if torch.distributed.get_rank() == 0: + import pdb; pdb.set_trace() + torch.distributed.barrier() experts_range = range(self.num_experts) w1_list = [self.w13_list[i].weight for i in experts_range] w2_list = [self.w2_list[i].weight for i in experts_range] diff --git a/neural_compressor/torch/utils/environ.py b/neural_compressor/torch/utils/environ.py index e60dcc7ad88..623d3446232 100644 --- a/neural_compressor/torch/utils/environ.py +++ b/neural_compressor/torch/utils/environ.py @@ -235,15 +235,18 @@ def is_tbb_available(): # pragma: no cover return False return True -def show_mem_info(loglevel="info"): +def show_mem_info(msg="", loglevel="info"): hpu_mem_mb = get_used_hpu_mem_MB() from neural_compressor.common.utils import logger show_fn = getattr(logger, loglevel) rank = torch.distributed.get_rank() if torch.distributed.is_initialized() else -1 - show_fn(f"[Rank {rank}] Used HPU memory: {hpu_mem_mb // 1000} GB {hpu_mem_mb % 1000} MB") + # show_fn(f"[Rank {rank}] Used HPU memory: {hpu_mem_mb // 1000} GB {hpu_mem_mb % 1000} MB") cpu_mem_mb = get_used_cpu_mem_MB() - show_fn(f"[Rank {rank}] Used CPU memory: {cpu_mem_mb // 1000} GB {cpu_mem_mb % 1000} MB") - + # show_fn(f"[Rank {rank}] Used CPU memory: {cpu_mem_mb // 1000} GB {cpu_mem_mb % 1000} MB") + show_fn( + f"[Rank {rank}] {msg}, HPU: {hpu_mem_mb // 1000} GB {hpu_mem_mb % 1000:.2f} MB; CPU: {cpu_mem_mb // 1000} GB {cpu_mem_mb % 1000:.2f} MB" + ) + def get_used_hpu_mem_MB(): """Get HPU used memory: MiB.""" From 472f0e27aa4c5fa7ff18a585c05f7edeca541a20 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 15:40:25 +0200 Subject: [PATCH 07/28] update Change-Id: 
Id225051fdede1b5e1def62083a7c5f93af5301cb Signed-off-by: Yi Liu --- neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py index 9fc60f26102..c2d5b1b0222 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py @@ -176,7 +176,7 @@ def prepare_model(model, mod_list, measurement, scale_file, scaling_method_name, logger.debug("Patched module name: %s", name) cur_accelerator.synchronize() logger.info("Patched module name: %s", name) - show_mem_info() + # show_mem_info() if save_file: # cache calculated scales save_scales(model, scales_obj, scales_file_format, scale_file + ".npz") save_scales(model, scales_obj, scales_file_format, scale_file + ".json") From e25d65826976ed1c32fde575baecf6314fdce006 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 15:46:39 +0200 Subject: [PATCH 08/28] sync for each tran Change-Id: I8cdf01349ce58ecc608511d970cedf6b33ab3045 Signed-off-by: Yi Liu --- .../algorithms/fp8_quant/_quant_common/helper_modules.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py index a0453253f95..a16df85d5d4 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py @@ -858,6 +858,7 @@ def post_process(self): for i in range(self.num_experts): self.w13_list[i].weight = torch.nn.Parameter(self.w13_list[i].weight.squeeze().t().contiguous()) self.w2_list[i].weight = torch.nn.Parameter(self.w2_list[i].weight.squeeze().t().contiguous()) + htcore.mark_step() def forward_measure( self, @@ -917,9 +918,9 @@ def forward_quant( router_weights = topk_weights.to(x.dtype) permuted_weights = True activation = "silu" - if torch.distributed.get_rank() == 0: - import pdb; pdb.set_trace() - torch.distributed.barrier() + # if torch.distributed.get_rank() == 0: + # import pdb; pdb.set_trace() + # torch.distributed.barrier() experts_range = range(self.num_experts) w1_list = [self.w13_list[i].weight for i in experts_range] w2_list = [self.w2_list[i].weight for i in experts_range] From 215d31c794b2254abb13f0ca1c6d9bf29b9eabde Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 15:54:23 +0200 Subject: [PATCH 09/28] fix OoM Change-Id: I9c9a72115a72a9cdc762d084227cf114588a1ef7 Signed-off-by: Yi Liu --- .../torch/algorithms/fp8_quant/_core/quantize.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py index c2d5b1b0222..967e3fe8ac9 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py @@ -208,15 +208,13 @@ def inspect_tensor(tensor, msg=""): show_mem_info("after convert_fp16_to_bf16") cur_accelerator.synchronize() show_mem_info("after synchronize") - if torch.distributed.get_rank() == 0: - import pdb; pdb.set_trace() - torch.distributed.barrier() - def postporcess_after_convert_(model): for _, mod in model.named_modules(): if hasattr(mod, "post_process"): mod.post_process() + # Note: It is 
very important to synchronize after each post_process to avoid OoM. + cur_accelerator.synchronize() def prepare_model_with_dummy_measurement(model, mod_list, scaling_method_name, scale_config): """Aim for loading, replace module with patched module for model on meta device. From e8d44f4494a9b5a257c6f453496fd32a903d2215 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 15:55:07 +0200 Subject: [PATCH 10/28] refine log Change-Id: I664f0935e3afa076c495acabdd2cc8e63ab38965 Signed-off-by: Yi Liu --- .../torch/algorithms/fp8_quant/_core/quantize.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py index 967e3fe8ac9..e725a1bd064 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py @@ -167,7 +167,7 @@ def prepare_model(model, mod_list, measurement, scale_file, scaling_method_name, scale_config, save_file) if not config.cfg["fake_quant"] and mod_default_dict[mod_type_str].should_measure_and_quant: quantize_params(mod, mod_extra_config) - logger.debug(f"patching module {name}") + # logger.debug(f"patching module {name}") patch_module(mod, mod_extra_config, mod_default_dict) name = origin_name patched_modules.append(name) @@ -175,7 +175,7 @@ def prepare_model(model, mod_list, measurement, scale_file, scaling_method_name, htcore.mark_step() logger.debug("Patched module name: %s", name) cur_accelerator.synchronize() - logger.info("Patched module name: %s", name) + # logger.info("Patched module name: %s", name) # show_mem_info() if save_file: # cache calculated scales save_scales(model, scales_obj, scales_file_format, scale_file + ".npz") From 13df72f692df09e3106c2e875af171d66bccdd89 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 16:07:41 +0200 Subject: [PATCH 11/28] brr after convert Change-Id: I95fdb8028e58cd4ba9b642b9af716d39b4cb7283 Signed-off-by: Yi Liu --- neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py index e725a1bd064..8de3aeb51c2 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py @@ -203,11 +203,11 @@ def inspect_tensor(tensor, msg=""): show_mem_info("after move all") postporcess_after_convert_(model) show_mem_info("after post process") - torch.distributed.barrier() convert_fp16_to_bf16(model) show_mem_info("after convert_fp16_to_bf16") cur_accelerator.synchronize() show_mem_info("after synchronize") + torch.distributed.barrier() def postporcess_after_convert_(model): for _, mod in model.named_modules(): From 325f69e8809514790bc210fdeab0b300cd09bca8 Mon Sep 17 00:00:00 2001 From: Yi Date: Mon, 24 Mar 2025 04:47:50 +0200 Subject: [PATCH 12/28] remove NUM_EXPERTS_GROUPS --- neural_compressor/common/utils/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/neural_compressor/common/utils/__init__.py b/neural_compressor/common/utils/__init__.py index 93133e30571..0ded3dcc90d 100644 --- a/neural_compressor/common/utils/__init__.py +++ b/neural_compressor/common/utils/__init__.py @@ -28,8 +28,8 @@ VLLM_TP_SIZE = int(os.getenv("VLLM_TP_SIZE", "8")) VLLM_EP_SIZE = int(os.getenv("VLLM_EP_SIZE", VLLM_TP_SIZE)) 
NUM_EXPERTS_PER_EP_RANK = DEEPSEEK_EXPERTS // VLLM_EP_SIZE # 32 -NUM_EXPERTS_GROUPS = int(os.getenv("NUM_EXPERTS_GROUPS", 8)) -NUM_EXPERTS_PER_GROUP_PER_RANK = NUM_EXPERTS_PER_EP_RANK // NUM_EXPERTS_GROUPS # 4 +VLLM_MOE_N_SLICE = int(os.getenv("VLLM_MOE_N_SLICE", 8)) +NUM_EXPERTS_PER_GROUP_PER_RANK = NUM_EXPERTS_PER_EP_RANK // VLLM_MOE_N_SLICE # 4 FUSED_MOE_EXPERTS = NUM_EXPERTS_PER_GROUP_PER_RANK # 4 logger.warning_once( @@ -37,7 +37,7 @@ f"INC uses VLLM_TP_SIZE={VLLM_TP_SIZE},\n" f"VLLM_EP_SIZE={VLLM_EP_SIZE},\n" f"NUM_EXPERTS_PER_EP_RANK={NUM_EXPERTS_PER_EP_RANK},\n" - f"NUM_EXPERTS_GROUPS={NUM_EXPERTS_GROUPS},\n" + f"VLLM_MOE_N_SLICE={VLLM_MOE_N_SLICE},\n" f"NUM_EXPERTS_PER_GROUP_PER_RANK={NUM_EXPERTS_PER_GROUP_PER_RANK},\n" f"FUSED_MOE_EXPERTS={FUSED_MOE_EXPERTS}" ) From ea8c842ab9855e67ffbef69e2403aec18cc3ed5f Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Mon, 24 Mar 2025 13:34:17 +0800 Subject: [PATCH 13/28] Refine R1 WOQ Requant (#2150) * rename moe op Signed-off-by: Yi * refine moe Signed-off-by: Yi --------- Signed-off-by: Yi --- .../torch/algorithms/fp8_quant/_core/common.py | 2 +- .../fp8_quant/_quant_common/helper_modules.py | 16 +++++++--------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/common.py b/neural_compressor/torch/algorithms/fp8_quant/_core/common.py index fda18e9c2d6..20ea48ddd55 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/common.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/common.py @@ -232,7 +232,7 @@ def convert_scales_to_tensors_dict(scales_obj, scales_file_format, hp_dtype, dev "MoeMatmul": ModuleInfo("linear", PatchedMoeMatmul), "MoeFP8Matmul": ModuleInfo("linear", PatchedMoeFP8Matmul), "ReplicatedLinear": ModuleInfo("linear", PatchedReplicatedLinear), - "DynamicMoeRuntimeDequantFP8": ModuleInfo("dynamic_moe", PatchedDynamicMoeRuntimeDequantFP8), + "VllmMixtureOfExpertsOpFP8": ModuleInfo("dynamic_moe", PatchedVllmMixtureOfExpertsOpFP8), # FIXME (Yi) revert change "FusedMoE": ModuleInfo("linear", PatchedMixtralMoE, False), # "GaudiMixtralSparseMoeBlock": ModuleInfo("dynamic_moe", PatchedGaudiMixtralSparseMoeBlock), diff --git a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py index a16df85d5d4..a2cd3ac258f 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py @@ -847,7 +847,7 @@ def extra_repr(self) -> str: f"quant_mode:{quant_mode}, {get_current_repr(self, *member_names)}", ) -class PatchedDynamicMoeRuntimeDequantFP8(PatchedVllmMixtureOfExpertsOpV1): +class PatchedVllmMixtureOfExpertsOpFP8(PatchedVllmMixtureOfExpertsOpV1): def _post_init_for_quant(self): pass @@ -871,15 +871,13 @@ def forward_measure( ): hidden_states = x measure_input((hidden_states,), observer=self._mod_extra_config.inputs) - # Assume moe_n_slice is 1 + # FIXME: (Yi) Assume moe_n_slice is 1, remove it? 
assert moe_n_slice == 1, f"moe_n_slice is {moe_n_slice}, expected 1" - i = 0 - # for i in range(moe_n_slice): - min_expert = i * n_expert_slice - max_expert = (i + 1) * n_expert_slice + min_expert = self.experts_min + max_expert = self.experts_max w13_list_slice = [] w2_list_slice = [] - for j in range(min_expert, max_expert): + for j in range(self.num_experts): w13_list_slice.append(self.w13_list[j].get_dequant_weight()) w2_list_slice.append(self.w2_list[j].get_dequant_weight()) @@ -891,8 +889,8 @@ def forward_measure( w3=w2_list_slice, permuted_weights=True, activation="silu", - experts_min=min_expert + ep_shift, - experts_max=max_expert - 1 + ep_shift, + experts_min=min_expert, + experts_max=max_expert, measurement_mode=True, # <============= ) output_measure_list = [output] From 653ca69158e1e6f6c078786fe4d53dc9bf78cfbb Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Mon, 24 Mar 2025 11:16:07 +0200 Subject: [PATCH 14/28] rename func names Change-Id: I1ebeb32c9c8cae1d6ceb400729c6dced5d56a9a5 Signed-off-by: Yi Liu --- .../torch/algorithms/fp8_quant/_core/common.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/common.py b/neural_compressor/torch/algorithms/fp8_quant/_core/common.py index 20ea48ddd55..99a31ffe4e4 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/common.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/common.py @@ -44,10 +44,10 @@ def maybe_dequant_original_fp8_weight(mod: torch.nn.Module, param: torch.Tensor): if param.dtype in [torch.float8_e4m3fn]: - if hasattr(mod, "get_post_process_weights_func"): - post_process_weights_func = mod.get_post_process_weights_func() - if post_process_weights_func is not None: - param = post_process_weights_func(mod) + if hasattr(mod, "get_dequant_weights_func"): + dequant_weights_func = mod.get_dequant_weights_func() + if dequant_weights_func is not None: + param = dequant_weights_func(mod) return param _mod_types = { From be597d5b5696fe8e8be9156ec6bf22905bfc9c97 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 25 Mar 2025 16:05:13 +0200 Subject: [PATCH 15/28] remove debug info Change-Id: I57dc09d7c7db1178ae2a55ec74c6de9c8d488e24 Signed-off-by: Yi Liu --- .../torch/algorithms/fp8_quant/_core/quantize.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py index 8de3aeb51c2..c16f59a74f2 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py @@ -175,8 +175,6 @@ def prepare_model(model, mod_list, measurement, scale_file, scaling_method_name, htcore.mark_step() logger.debug("Patched module name: %s", name) cur_accelerator.synchronize() - # logger.info("Patched module name: %s", name) - # show_mem_info() if save_file: # cache calculated scales save_scales(model, scales_obj, scales_file_format, scale_file + ".npz") save_scales(model, scales_obj, scales_file_format, scale_file + ".json") @@ -184,20 +182,6 @@ def prepare_model(model, mod_list, measurement, scale_file, scaling_method_name, logger.debug("Patched modules: %s", patched_modules) logger.debug("Total patched modules: %d", len(patched_modules)) - - def inspect_tensor(tensor, msg=""): - - if "cpu" in str(tensor.device): - logger.info(f"{msg}: tensor dtype: {tensor.dtype}, tensor shape: {tensor.shape}, deveice: {tensor.device}") - # if torch.distributed.get_rank() 
== 0: - # import pdb; pdb.set_trace() - # torch.distributed.barrier() - - for pname, param in model.named_parameters(): - inspect_tensor(param, pname) - # check buffer - for bname, buffer in model.named_buffers(): - inspect_tensor(buffer, bname) show_mem_info("before move all") model = model.to(cur_accelerator.name()) show_mem_info("after move all") From f72e7828827c94e286062cb3a673c6d1d1167bfe Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Mon, 31 Mar 2025 16:39:44 +0300 Subject: [PATCH 16/28] force install pt Change-Id: I8195c4ac812396319843c14b1af9721857516fca Signed-off-by: Yi Liu --- setup.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index d0be2291d6f..c628b1037c2 100644 --- a/setup.py +++ b/setup.py @@ -102,7 +102,9 @@ def get_build_version(): # https://github.com/pytorch/pytorch/pull/114662 ext_modules = [] cmdclass = {} - + + + if "pt" in sys.argv: sys.argv.remove("pt") cfg_key = "neural_compressor_pt" @@ -110,7 +112,9 @@ def get_build_version(): if "tf" in sys.argv: sys.argv.remove("tf") cfg_key = "neural_compressor_tf" - + # FIXME: (Yi) force install neural_compressor_pt + print(f"Forcing install neural_compressor_pt") + cfg_key = "neural_compressor_pt" project_name = PKG_INSTALL_CFG[cfg_key].get("project_name") include_packages = PKG_INSTALL_CFG[cfg_key].get("include_packages") or {} package_data = PKG_INSTALL_CFG[cfg_key].get("package_data") or {} From c1f938a05d1d78c83a27bc3775e302d2d4b5cec7 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Mon, 31 Mar 2025 16:49:16 +0300 Subject: [PATCH 17/28] install all reqs Change-Id: Ibf48a8bc7c9eb9947345fdf617ccf483f70fb2b8 Signed-off-by: Yi Liu --- setup.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index c628b1037c2..5d2fe1f9f4a 100644 --- a/setup.py +++ b/setup.py @@ -75,7 +75,12 @@ def get_build_version(): ], ), "package_data": {"": ["*.json"]}, - "install_requires": fetch_requirements("requirements_pt.txt"), + # FIXME: (Yi) force install neural_compressor_pt + # "install_requires": fetch_requirements("requirements_pt.txt"), + "install_requires": fetch_requirements("requirements.txt"), + "extras_require": { + "pt": fetch_requirements("requirements_pt.txt"), + } }, # 3.x tf binary build config, pip install neural-compressor-tf, install 3.x TensorFlow API. 
"neural_compressor_tf": { From c748f5e7f143ff0f50ac9ea05a54364eb73866f2 Mon Sep 17 00:00:00 2001 From: Yi Date: Tue, 1 Apr 2025 06:33:13 +0300 Subject: [PATCH 18/28] update dequant func --- .../torch/algorithms/fp8_quant/_core/quantize.py | 1 + .../fp8_quant/_core/scale_methods/ops_quantizer.py | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py index c16f59a74f2..b73a69ed466 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py @@ -83,6 +83,7 @@ def quantize_params(mod, mod_extra_config): quantized_param = quantizer(param.to(cur_accelerator.name())) delattr(mod, param_name) setattr(mod, param_name, nn.Parameter(quantized_param)) + mod._updated_weight = True quantized_param = getattr(mod, param_name) quantized_param.requires_grad_(False) cur_accelerator.synchronize() diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/ops_quantizer.py b/neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/ops_quantizer.py index 738c12e0ff7..fe23a92a6c3 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/ops_quantizer.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/ops_quantizer.py @@ -20,7 +20,8 @@ from .scales_method import QuantTensorType from ..quant_dequant import DequantOutput, QuantDequant, QuantDequantNone, QuantInput from neural_compressor.common import utils as inc_utils - +# from neural_compressor.torch.algorithms.fp8_quant.utils import +from neural_compressor.torch.algorithms.fp8_quant._core.common import maybe_dequant_original_fp8_weight class BaseOpQuantizer: def __init__(self, config, mod, measurement, params, op_type): @@ -96,7 +97,8 @@ def get_scales_module_config(self): rescaled_weight = self.mod.weight if hasattr(self.mod, 'weight') else None if self.weight_ich_scale_calc is not None: weight_scales_in_ch = self.weight_ich_scale_calc.calc_scales(input_scales[0], QuantTensorType.CONST) - rescaled_weight = torch.div(self.mod.weight, weight_scales_in_ch.reshape([1, -1])) + bf16_weight = maybe_dequant_original_fp8_weight(self.mod, self.mod.weight) + rescaled_weight = torch.div(bf16_weight, weight_scales_in_ch.reshape([1, -1])) weights_scales_out_ch = self.weight_och_scale_calc.calc_scales(rescaled_weight, QuantTensorType.CONST) params_config = ( {"weight": weights_scales_out_ch} From eb5f04d93d46991b89a1af82296d960b70f44d73 Mon Sep 17 00:00:00 2001 From: Yi Date: Tue, 1 Apr 2025 18:27:03 +0800 Subject: [PATCH 19/28] update --- neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py index b73a69ed466..201172475c5 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py @@ -83,7 +83,8 @@ def quantize_params(mod, mod_extra_config): quantized_param = quantizer(param.to(cur_accelerator.name())) delattr(mod, param_name) setattr(mod, param_name, nn.Parameter(quantized_param)) - mod._updated_weight = True + # Note: in case of re-quantize the fp8 weights, we need to set `updated_fp8_weight` to True + mod.updated_fp8_weight = True quantized_param = getattr(mod, param_name) 
quantized_param.requires_grad_(False) cur_accelerator.synchronize() From fcf303148211225b6af435413cffa14457b365c6 Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Sat, 5 Apr 2025 15:55:55 +0800 Subject: [PATCH 20/28] Update PatchedVLLMKVCache for deepseek performance (#2165) --- .../fp8_quant/_quant_common/helper_modules.py | 32 +++++++------------ 1 file changed, 11 insertions(+), 21 deletions(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py index a2cd3ac258f..5652327fd6e 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py @@ -1082,10 +1082,10 @@ def forward_qdq(self, input, *args, **kwargs): output_cache = self.orig_mod(qinput, *args, **kwargs) return output_cache - # def forward_quant(self, input, *args, **kwargs): - # qinput = self.quant_input(input) - # output_cache = self.orig_mod(qinput, *args, **kwargs) - # return self.dequant_output(output_cache) + def forward_quant(self, input, *args, **kwargs): + qinput = self.quant_input(input) + output_cache = self.orig_mod(qinput, *args, **kwargs) + return self.dequant_output(output_cache) def forward_measure(self, input, *args, **kwargs): measure_input((input, ), self._mod_extra_config.inputs) @@ -1093,22 +1093,12 @@ def forward_measure(self, input, *args, **kwargs): measure_output((output_cache, ), self._mod_extra_config.outputs) return output_cache - # def fetch_from_cache(self, cache, blocks, permutations=None): - # # quant_cache = self.quant_input(cache) - # quant_cache = cache - # if permutations: - # output_cache = self.orig_mod.fetch_from_cache(quant_cache, blocks, permutations) - # for i in range(len(output_cache)): - # output_cache[i] = self.dequant_output(output_cache[i]) - # return output_cache - # output_cache = self.orig_mod.fetch_from_cache(quant_cache, blocks) - # return self.dequant_output(output_cache) - - def forward_quant(self, input, *args, **kwargs): - qinput = self.quant_input(input) - return self.orig_mod(qinput, *args, **kwargs) - - def fetch_from_cache(self, quant_cache, blocks, permutations=None): + def fetch_from_cache(self, cache, blocks, permutations=None): + # TODO: Remove this workaround in next release [SW-221595] + if cache.dtype != self.lp_dtype: + quant_cache = self.quant_input(cache) + else: + quant_cache = cache if permutations: output_cache = self.orig_mod.fetch_from_cache(quant_cache, blocks, permutations) for i in range(len(output_cache)): @@ -1116,7 +1106,7 @@ def fetch_from_cache(self, quant_cache, blocks, permutations=None): return output_cache output_cache = self.orig_mod.fetch_from_cache(quant_cache, blocks) return self.dequant_output(output_cache) - + def extra_repr(self) -> str: return f"PatchedVLLMKVCache" From 39d1ccfbbf19797c0262432236fd311eb5752589 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Mon, 7 Apr 2025 04:15:17 +0300 Subject: [PATCH 21/28] align fused moe op Change-Id: I6a19a4d7492a221785901c464b8d8c8ccaedd898 Signed-off-by: Yi Liu --- .../algorithms/fp8_quant/_quant_common/helper_modules.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py index 5652327fd6e..73aed3eb05c 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py +++ 
b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py @@ -865,14 +865,14 @@ def forward_measure( x, topk_ids, topk_weights, - moe_n_slice, - n_expert_slice, - ep_shift, + moe_n_slice=None, + n_expert_slice=None, + ep_shift=None, ): hidden_states = x measure_input((hidden_states,), observer=self._mod_extra_config.inputs) # FIXME: (Yi) Assume moe_n_slice is 1, remove it? - assert moe_n_slice == 1, f"moe_n_slice is {moe_n_slice}, expected 1" + # assert moe_n_slice == 1, f"moe_n_slice is {moe_n_slice}, expected 1" min_expert = self.experts_min max_expert = self.experts_max w13_list_slice = [] From c3c60a9d602f7640430828502d3e2ef1e0d7f39f Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Mon, 7 Apr 2025 06:57:55 +0300 Subject: [PATCH 22/28] fix moe op Change-Id: I94138049a5609accae94aeec40c9b82382eb705d --- neural_compressor/torch/algorithms/fp8_quant/_core/measure.py | 3 +++ .../algorithms/fp8_quant/_quant_common/helper_modules.py | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/measure.py b/neural_compressor/torch/algorithms/fp8_quant/_core/measure.py index 36ea0616f94..1c1e6fefc7e 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/measure.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/measure.py @@ -149,6 +149,9 @@ def register_patched_measure_modules(model, mod_list, observer_class, d_shapes=N logger.info(f"Patching measure module {name} {mod.__class__}") num_info += 1 set_hqt_config(mod, top_level_config) # set config in the module, as it consumed by the patched module + # override default number of outputs for dynamic moe + mod_types[mod_type].num_outputs = mod.num_experts+1 + logger.warning(f"Dynamic moe num_outputs set to {mod.num_experts+1}") mod_extra_config = ( init_measure_object( mod, diff --git a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py index 73aed3eb05c..b80b6e8a3e5 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py @@ -907,8 +907,8 @@ def forward_quant( x, topk_ids, topk_weights, - moe_n_slice, - n_expert_slice, + moe_n_slice=None, + n_expert_slice=None, ep_shift=None, ): hidden_states = x From ccb90e5d7144c2111186710cdea289aa9f9b3fb9 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Mon, 7 Apr 2025 07:07:12 +0300 Subject: [PATCH 23/28] fix moe op num_outputs Change-Id: Ic3e689f026497908f409d9ffc2a69971fe36d0cc Signed-off-by: Yi Liu --- .../torch/algorithms/fp8_quant/_core/measure.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/measure.py b/neural_compressor/torch/algorithms/fp8_quant/_core/measure.py index 1c1e6fefc7e..ef9bdc1c7b4 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/measure.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/measure.py @@ -149,9 +149,10 @@ def register_patched_measure_modules(model, mod_list, observer_class, d_shapes=N logger.info(f"Patching measure module {name} {mod.__class__}") num_info += 1 set_hqt_config(mod, top_level_config) # set config in the module, as it consumed by the patched module - # override default number of outputs for dynamic moe - mod_types[mod_type].num_outputs = mod.num_experts+1 - logger.warning(f"Dynamic moe num_outputs set to {mod.num_experts+1}") + if mod_type == "dynamic_moe" 
and hasattr(mod, "num_experts"): + # override default number of outputs for dynamic moe + mod_types[mod_type].num_outputs = mod.num_experts+1 + logger.warning(f"Dynamic moe num_outputs set to {mod.num_experts+1}") mod_extra_config = ( init_measure_object( mod, From 56de6185b59fe6697df4a8715a4e565361cc8750 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 9 Apr 2025 16:20:29 +0800 Subject: [PATCH 24/28] dequant for pts as well (#2167) Change-Id: Ic1b11c25fa7402ab0b4e2c0cca0939a89d20ffa2 Signed-off-by: Yi Liu Co-authored-by: Yi Liu --- .../fp8_quant/_core/scale_methods/ops_quantizer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/ops_quantizer.py b/neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/ops_quantizer.py index fe23a92a6c3..697001ff818 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/ops_quantizer.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/ops_quantizer.py @@ -95,10 +95,11 @@ def get_scales_module_config(self): input_scales = self.calc_input_scales(num_of_inputs=1) output_measurement = self.measurement.outputs[0] if self.measurement is not None else [] rescaled_weight = self.mod.weight if hasattr(self.mod, 'weight') else None + if rescaled_weight is not None: + rescaled_weight = maybe_dequant_original_fp8_weight(self.mod, rescaled_weight) if self.weight_ich_scale_calc is not None: weight_scales_in_ch = self.weight_ich_scale_calc.calc_scales(input_scales[0], QuantTensorType.CONST) - bf16_weight = maybe_dequant_original_fp8_weight(self.mod, self.mod.weight) - rescaled_weight = torch.div(bf16_weight, weight_scales_in_ch.reshape([1, -1])) + rescaled_weight = torch.div(rescaled_weight, weight_scales_in_ch.reshape([1, -1])) weights_scales_out_ch = self.weight_och_scale_calc.calc_scales(rescaled_weight, QuantTensorType.CONST) params_config = ( {"weight": weights_scales_out_ch} From e340ed95de92f74b1d135c3b4fc5b4f313c06772 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Sat, 12 Apr 2025 20:29:46 +0800 Subject: [PATCH 25/28] correct linear measure (#2170) Signed-off-by: Yi --- .../fp8_quant/_quant_common/helper_modules.py | 42 ++++++++----------- 1 file changed, 17 insertions(+), 25 deletions(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py index b80b6e8a3e5..182f14311fc 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py @@ -416,22 +416,14 @@ def forward_quant(self, input): return self.post_all_reduce(dqoutput) def forward_measure(self, input): - # if torch.distributed.get_rank() == 0: - # import pdb; pdb.set_trace() - # torch.distributed.barrier() - # resolved_input = self.resolve_input(input) - # measure_input((resolved_input,), observer=self._mod_extra_config.inputs) + resolved_input = self.resolve_input(input) + measure_input((resolved_input,), observer=self._mod_extra_config.inputs) # output = torch.matmul(resolved_input, self.weight.transpose(-1, -2)) - # measure_output((output,), self._mod_extra_config.outputs) - # if self.reduce_results: - # output = self.collective_func(output) - # return self.post_all_reduce(output) - - # FIXME: It is not fully correct here - measure_input((input,), observer=self._mod_extra_config.inputs) - output, output_bias = self.orig_mod(input) + output = 
self.orig_mod.quant_method.apply(self.orig_mod, resolved_input) measure_output((output,), self._mod_extra_config.outputs) - return output, output_bias + if self.reduce_results: + output = self.collective_func(output) + return self.post_all_reduce(output) def post_all_reduce(self, output): assert ( @@ -482,21 +474,21 @@ def forward_quant(self, input): return self.post_all_reduce(dqoutput) def forward_measure(self, input): - # if torch.distributed.get_rank() == 0: - # import pdb; pdb.set_trace() measure_input((input,), observer=self._mod_extra_config.inputs) - # FIXME: it is not fully correct here, - output, output_bias = self.orig_mod(input) + output = self.orig_mod.quant_method.apply(self.orig_mod, input) measure_output((output,), self._mod_extra_config.outputs) + output, output_bias = self.add_bias(output) + if self.gather_output: + output = self.collective_func(output) return output, output_bias - # output = torch.matmul(input, self.weight.transpose(-1, -2)) - - # output = torch.matmul(input, self.weight.transpose(-1, -2)) - # measure_output((output,), self._mod_extra_config.outputs) - # if self.gather_output: - # output = self.collective_func(output) - # return self.post_all_reduce(output) + def add_bias(self, output): + if not self.skip_bias_add: + output = output + self.bias if self.bias is not None else output + output_bias = None + else: + output_bias = self.bias + return output, output_bias def post_all_reduce(self, output): if not self.skip_bias_add: From 887787eec19ae6b8c457ceb7de301195e884fac9 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 16 May 2025 08:28:39 +0800 Subject: [PATCH 26/28] add INC_MEASUREMENT_DUMP_PATH_PREFIX (#2210) Signed-off-by: yiliu30 --- .../fp8_quant/_quant_common/quant_config.py | 21 ++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/quant_config.py b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/quant_config.py index e4ebdefaa11..ad334f8bfbc 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/quant_config.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/quant_config.py @@ -24,7 +24,7 @@ import torch from neural_compressor.torch.utils.auto_accelerator import auto_detect_accelerator, INCAcceleratorType -from ..utils.logger import logger +from neural_compressor.torch.utils import logger try: world_size = torch.distributed.get_world_size() @@ -225,6 +225,25 @@ def parse(custom_config: Mapping[str, str]) -> Fp8cfg: else: measured_global_config[keys] = custom_config[keys] + INC_MEASUREMENT_DUMP_PATH_PREFIX = os.getenv("INC_MEASUREMENT_DUMP_PATH_PREFIX", None) + if INC_MEASUREMENT_DUMP_PATH_PREFIX is not None: + dump_stats_path = os.path.join(INC_MEASUREMENT_DUMP_PATH_PREFIX, measured_global_config["dump_stats_path"]) + measured_global_config["dump_stats_path"] = dump_stats_path + logger.info( + f"INC_MEASUREMENT_DUMP_PATH_PREFIX is set to {INC_MEASUREMENT_DUMP_PATH_PREFIX}, dump_stats_path is set to {dump_stats_path}" + ) + # check if the directory exists + + dir_path = os.path.dirname(measured_global_config["dump_stats_path"]) + if not os.path.exists(dir_path): + raise ValueError( + ( + f"The measurement dump directory '{dir_path}' does not exist," + f" the path is determined by the environment variable INC_MEASUREMENT_DUMP_PATH_PREFIX" + f" and the dump_stats_path in the quantization config file." 
+ ) + ) + # If seperate_measure_files is True (default value), then it is assumed that there are multiple distinct measure and scale files # and they are stored in / loaded from paths with the correct index as a suffix. Else, only one is searched for. measured_global_config["local_rank"] = ( From 971a7e55b017084ae3b4e4770ef4c678eb2b0f9a Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 16 May 2025 15:51:39 +0800 Subject: [PATCH 27/28] fix path check (#2212) Signed-off-by: yiliu30 --- .../torch/algorithms/fp8_quant/_quant_common/quant_config.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/quant_config.py b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/quant_config.py index ad334f8bfbc..88a703e7c6f 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/quant_config.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/quant_config.py @@ -233,9 +233,10 @@ def parse(custom_config: Mapping[str, str]) -> Fp8cfg: f"INC_MEASUREMENT_DUMP_PATH_PREFIX is set to {INC_MEASUREMENT_DUMP_PATH_PREFIX}, dump_stats_path is set to {dump_stats_path}" ) # check if the directory exists - + dir_path = os.path.dirname(measured_global_config["dump_stats_path"]) - if not os.path.exists(dir_path): + abs_path = os.path.abspath(dir_path) + if not (os.path.exists(dir_path) or os.path.exists(abs_path)): raise ValueError( ( f"The measurement dump directory '{dir_path}' does not exist," From 0bd4390d63153a59914fcf1d5735f9b3d6aae11e Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 16 May 2025 16:26:57 +0800 Subject: [PATCH 28/28] Naive Scaling (#2211) * add INC_MEASUREMENT_DUMP_PATH_PREFIX Signed-off-by: yiliu30 * add naive scaling Signed-off-by: yi * Update neural_compressor/torch/utils/environ.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --------- Signed-off-by: yiliu30 Signed-off-by: yi Co-authored-by: yi Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../torch/algorithms/fp8_quant/_core/fp_utils.py | 6 ++++++ .../fp8_quant/_core/scale_methods/scale_method_factory.py | 6 +++++- neural_compressor/torch/utils/environ.py | 4 ++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/fp_utils.py b/neural_compressor/torch/algorithms/fp8_quant/_core/fp_utils.py index 0ea9492e6f3..5f229ea8a60 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/_core/fp_utils.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/fp_utils.py @@ -17,6 +17,9 @@ from neural_compressor.torch.utils.auto_accelerator import auto_detect_accelerator, INCAcceleratorType cur_accelerator = auto_detect_accelerator() +from neural_compressor.torch.utils import environ +from neural_compressor.common.utils import logger + descale_fcn = lambda x, scale: torch.mul(x, scale) scale_fcn = lambda x, scale: torch.div(x, scale) cast_fcn = lambda x, dtype: x.to(dtype=dtype) @@ -106,6 +109,9 @@ def get_fp8_hw_alligned_scales(dtype, device): } def calc_maxabs_scale(xmaxabs, fullscale, backoff=1): + if environ.INC_FORCE_NAIVE_SCALING: + backoff = 1.0 + logger.warning_once(f"Enabled naive scaling, backoff is set to {backoff}") scale = xmaxabs / (fullscale * backoff) return scale diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/scale_method_factory.py b/neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/scale_method_factory.py index b1517c712b1..91d97fb6fda 100644 --- 
a/neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/scale_method_factory.py +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/scale_method_factory.py @@ -16,7 +16,8 @@ from .round_scales_function import * from ..common import get_device_type_for_scales from .scales_method import * - +from neural_compressor.torch.utils import environ +from neural_compressor.common.utils import logger class QuantTensorName(Enum): INPUT = auto() @@ -40,6 +41,9 @@ class ScaleValueType(Enum): def parse_rounding_method(config, device_for_scales): round_method = ScaleIdentity() + if environ.INC_FORCE_NAIVE_SCALING: + logger.warning_once("Enabled naive scaling") + return round_method if "single" in config and "hw" in config: round_method = ScaleHwAlignedFixed(device_for_scales) elif "unit" in config: diff --git a/neural_compressor/torch/utils/environ.py b/neural_compressor/torch/utils/environ.py index 623d3446232..7c40af92710 100644 --- a/neural_compressor/torch/utils/environ.py +++ b/neural_compressor/torch/utils/environ.py @@ -32,6 +32,10 @@ world_size = int(os.getenv("WORLD_SIZE", "-1")) + +INC_FORCE_NAIVE_SCALING = os.getenv("INC_FORCE_NAIVE_SCALING", "0").lower() in ["1", "true"] + + ################ Check imported sys.module first to decide behavior ################# def is_ipex_imported() -> bool: """Check whether intel_extension_for_pytorch is imported."""
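
Taken together, patches 26 and 27 make the measurement dump location overridable through the INC_MEASUREMENT_DUMP_PATH_PREFIX environment variable: when it is set, the prefix is joined with dump_stats_path from the quantization config, and the resulting parent directory must already exist (checked both as given and as an absolute path). Below is a minimal standalone sketch of that resolution logic; the function name and example values are illustrative and not part of the patched module.

import os

def resolve_dump_stats_path(dump_stats_path: str) -> str:
    """Illustrative sketch of the dump-path handling from patches 26/27."""
    prefix = os.getenv("INC_MEASUREMENT_DUMP_PATH_PREFIX", None)
    if prefix is not None:
        dump_stats_path = os.path.join(prefix, dump_stats_path)
        # The target directory must already exist; both the given and the
        # absolute form are accepted so relative prefixes keep working.
        dir_path = os.path.dirname(dump_stats_path)
        abs_path = os.path.abspath(dir_path)
        if not (os.path.exists(dir_path) or os.path.exists(abs_path)):
            raise ValueError(
                f"The measurement dump directory '{dir_path}' does not exist;"
                " it is derived from INC_MEASUREMENT_DUMP_PATH_PREFIX and the"
                " dump_stats_path in the quantization config file."
            )
    return dump_stats_path

For example, with INC_MEASUREMENT_DUMP_PATH_PREFIX=/tmp/inc_measure (a hypothetical path) and dump_stats_path "llama/measure", the stats land under /tmp/inc_measure/llama/measure, provided /tmp/inc_measure/llama already exists.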
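
Patch 28 adds a second switch, INC_FORCE_NAIVE_SCALING, parsed once in neural_compressor/torch/utils/environ.py and consulted in fp_utils.py and scale_method_factory.py: when the flag is true, the maxabs backoff is forced to 1.0 and scale rounding falls back to ScaleIdentity, so scales are used exactly as computed from the measured maxabs. A condensed sketch of that effect, with plain floats standing in for the tensors used by the real helper:

import os

# Parsed once at import time, mirroring the convention added to environ.py.
INC_FORCE_NAIVE_SCALING = os.getenv("INC_FORCE_NAIVE_SCALING", "0").lower() in ["1", "true"]

def calc_maxabs_scale(xmaxabs: float, fullscale: float, backoff: float = 1.0) -> float:
    """Sketch of the maxabs scale computation with the naive-scaling override."""
    if INC_FORCE_NAIVE_SCALING:
        backoff = 1.0  # drop the configured backoff margin entirely
    return xmaxabs / (fullscale * backoff)

With xmaxabs=120, fullscale=240 and backoff=0.5 (all illustrative numbers), the default path yields a scale of 1.0 while naive scaling yields 0.5; hardware-aligned rounding of the result is likewise skipped, since parse_rounding_method returns ScaleIdentity early when the flag is set.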