intel · mengniwang95 · Oct 13, 2025
diff --git a/neural_compressor/torch/algorithms/weight_only/autoround.py b/neural_compressor/torch/algorithms/weight_only/autoround.py
@@ -100,6 +100,10 @@ def __init__(
         truncation: bool = False,
         # 0.7
         scheme: Union[str, dict, QuantizationScheme] = "W4A16",
+        # diffusion
+        guidance_scale: float = 7.5,
+        num_inference_steps: int = 50,
+        generator_seed: int = None,
         **kwargs,
     ):
         """Init a AutQRoundQuantizer object.
@@ -172,6 +176,10 @@ def __init__(
             template (Template): The template to specify process for different mllms.
             truncation (bool): Activates truncation to cut input sequences longer than `max_length` to `max_length`.
             scheme (str| dict | QuantizationScheme ): A preset scheme that defines the quantization configurations.
+            guidance_scale (float): Controls how closely the image generation process follows the text prompt.
+                                    Higher values follow the prompt more closely (default is 7.5).
+            num_inference_steps (int): The reference number of denoising steps (default is 50).
+            generator_seed (int): A seed that controls the initial noise for image generation (default is None).
 
         Returns:
             The quantized model.
@@ -227,6 +235,9 @@ def __init__(
         self.device_map = device_map
         self.quant_lm_head = quant_lm_head
         self.enable_w4afp8 = self._is_w4afp8()
+        self.guidance_scale = guidance_scale
+        self.num_inference_steps = num_inference_steps
+        self.generator_seed = generator_seed
 
     def _is_w4afp8(self) -> bool:
         return any([v.get("data_type", None) == "fp8_to_int_sym" for v in self.quant_config.values()])
@@ -252,13 +263,16 @@ def convert(self, model: torch.nn.Module, *args, **kwargs):
         Returns:
             The quantized model.
         """
+        pipe = kwargs.pop("pipeline", None)
         tokenizer = getattr(model.orig_model, "tokenizer", None)
         if tokenizer is not None:
             delattr(model.orig_model, "tokenizer")
-        else:
+        elif pipe is None:
             tokenizer = "Placeholder"
             self.dataset = CapturedDataloader(model.args_list, model.kwargs_list)
         model = model.orig_model
+        if pipe is not None:
+            model = pipe
         rounder = AutoRound(
             model,
             layer_config=self.layer_config,
@@ -307,6 +321,9 @@ def convert(self, model: torch.nn.Module, *args, **kwargs):
             truncation=self.truncation,
             enable_torch_compile=self.enable_torch_compile,
             quant_lm_head=self.quant_lm_head,
+            guidance_scale=self.guidance_scale,
+            num_inference_steps=self.num_inference_steps,
+            generator_seed=self.generator_seed,
         )
 
         if self.enable_w4afp8:

diff --git a/neural_compressor/torch/quantization/algorithm_entry.py b/neural_compressor/torch/quantization/algorithm_entry.py
@@ -608,6 +608,7 @@ def autoround_quantize_entry(
                 "act_data_type": act_data_type,
             }
             layer_config = quant_config.to_dict().get("layer_config", None)
+            dataset = quant_config.to_dict().get("dataset", "NeelNanda/pile-10k")
             output_dir = quant_config.to_dict().get("output_dir", "temp_auto_round")
             enable_full_range = quant_config.enable_full_range
             batch_size = quant_config.batch_size
@@ -642,6 +643,9 @@ def autoround_quantize_entry(
             scheme = quant_config.scheme
             device_map = quant_config.device_map
             quant_lm_head = quant_config.quant_lm_head
+            guidance_scale = quant_config.to_dict().get("guidance_scale", 7.5)
+            num_inference_steps = quant_config.to_dict().get("num_inference_steps", 50)
+            generator_seed = quant_config.to_dict().get("generator_seed", None)
 
     kwargs.pop("example_inputs")
     quantizer = get_quantizer(
@@ -665,6 +669,7 @@ def autoround_quantize_entry(
         batch_size=batch_size,
         amp=amp,
         lr_scheduler=lr_scheduler,
+        dataset=dataset,
         enable_quanted_input=enable_quanted_input,
         enable_minmax_tuning=enable_minmax_tuning,
         lr=lr,
@@ -694,6 +699,9 @@ def autoround_quantize_entry(
         scheme=scheme,
         device_map=device_map,
         quant_lm_head=quant_lm_head,
+        guidance_scale=guidance_scale,
+        num_inference_steps=num_inference_steps,
+        generator_seed=generator_seed,
     )
     model = quantizer.execute(model=model, mode=mode, *args, **kwargs)
     model.qconfig = configs_mapping

diff --git a/neural_compressor/torch/quantization/quantize.py b/neural_compressor/torch/quantization/quantize.py
@@ -228,6 +228,7 @@ def convert(
     model: torch.nn.Module,
     quant_config: BaseConfig = None,
     inplace: bool = True,
+    **kwargs,
 ):
     """Convert the prepared model to a quantized model.
 
@@ -284,6 +285,7 @@ def convert(
                 configs_mapping,
                 example_inputs=example_inputs,
                 mode=Mode.CONVERT,
+                **kwargs,
             )
     setattr(q_model, "is_quantized", True)
     return q_model