@@ -474,8 +474,8 @@ def _apply_smoothing(self, model: Module) -> None:
             with calibration_forward_context(model), HooksMixin.disable_hooks():
                 # [STEP 3]: Compute output of module
                 # could cache from hook, rather than recomputing here
-                fp16_outputs = self._run_samples(parent_module)
-                if len(fp16_outputs) == 0 or all(f.numel() == 0 for f in fp16_outputs):
+                fp16_output = self._get_flattened_output(parent_module)
+                if fp16_output.numel() == 0:
                     logger.info(
                         f"Skipping smooth_layer {mapping.smooth_name}, no activations "
                         "found to scale. This can occasionally occur in MoE models "
@@ -488,7 +488,7 @@ def _apply_smoothing(self, model: Module) -> None:
 
                 # [STEP 4]: Compute loss
                 best_scales = self._compute_best_scale(
-                    x_mean, w_mean, parent_module, balance_layers, fp16_outputs
+                    x_mean, w_mean, parent_module, balance_layers, fp16_output
                 )
 
     @torch.no_grad()
@torch .no_grad ()
@@ -541,25 +541,35 @@ def smooth(module):
             v.batch_intermediates.clear()
         self._assert_all_activations_consumed()
 
-    def _run_samples(self, module: Module) -> List[torch.Tensor]:
+    def _get_flattened_output(self, module: Module) -> torch.Tensor:
+        """
+        Returns output of running cached batch inputs through module
+        Output tensor is 1D, as shapes aren't necessary for calculating loss
+        """
         with align_module_device(module):
             outputs = [
                 module(**batch_kwargs)
                 for batch_kwargs in self._parent_args_cache[module]
             ]
-        return [
-            # If Tuple, assume that first argument is the input
-            output[0] if isinstance(output, Tuple) else output
-            for output in outputs
-        ]
+        return torch.cat(
+            [
+                # If Tuple, assume that first argument is the input
+                (
+                    output[0].reshape(-1)
+                    if isinstance(output, Tuple)
+                    else output.reshape(-1)
+                )
+                for output in outputs
+            ]
+        )
 
     def _compute_best_scale(
         self,
         x_mean: torch.Tensor,
         w_mean: torch.Tensor,
         parent_module: torch.nn.Module,
         linears2scale: List[torch.nn.Linear],
-        fp16_outputs: List[torch.Tensor],
+        fp16_output: torch.Tensor,
     ) -> torch.Tensor:
         """
         Compute loss and select best scales
@@ -618,10 +628,10 @@ def _compute_best_scale(
 
             # W * X
             with HooksMixin.disable_hooks():
-                int_w_outputs = self._run_samples(parent_module)
+                int_w_output = self._get_flattened_output(parent_module)
 
             # compute mean squared error (L2 norm)
-            loss = self._compute_loss(fp16_outputs, int_w_outputs, device)
+            loss = _compute_loss(fp16_output, int_w_output)
 
             history.append(loss)
             if loss < best_error:
@@ -640,34 +650,6 @@ def _compute_best_scale(
 
         return best_scales.detach().cpu()
 
-    @torch.no_grad()
-    def _compute_loss(
-        self,
-        fp16_outputs: List[torch.Tensor],
-        int_w_outputs: List[torch.Tensor],
-        device: torch.device,
-    ) -> torch.Tensor:
-        loss = 0.0
-        num_elements = 0
-
-        # Compute the MSE loss for each batch
-        for fp16_batch, int_w_batch in zip(fp16_outputs, int_w_outputs):
-            batch_loss = (
-                (fp16_batch.to(device) - int_w_batch.to(device))
-                .view(-1)
-                .float()
-                .pow(2)
-                .sum()
-                .item()
-            )
-            loss += batch_loss
-            num_elements += fp16_batch.numel()
-
-        # Normalize the loss by the total number of elements
-        loss /= num_elements
-
-        return loss
-
     def _assert_all_activations_consumed(self):
         """
         Confirm all activations have been consumed
@@ -677,6 +659,17 @@ def _assert_all_activations_consumed(self):
             raise RuntimeError("Some cached activations were not used")
 
 
+@torch.no_grad()
+@torch.compile()
+def _compute_loss(
+    fp16_output: torch.Tensor,
+    int_w_output: torch.Tensor,
+) -> torch.Tensor:
+    """Compute MSE loss between fp16 and quantized outputs"""
+    return (fp16_output - int_w_output).view(-1).float().pow(2).mean()
+
+
+@torch.compile()
 def _pseudo_quantize_tensor(
     w: torch.Tensor, symmetric: bool = False, bit_width: int = 8, group_size: int = -1
 ):
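
Reviewer note (not part of the diff above): the refactor is loss-preserving, because the old per-batch accumulation sums squared differences over every batch and divides by the total element count, which is exactly the mean over the flattened, concatenated 1D tensors that the new code builds. A minimal standalone sketch of that equivalence, using arbitrary fake batch shapes:

import torch

torch.manual_seed(0)

# Fake per-batch module outputs, as the old _run_samples would return them
fp16_outputs = [torch.randn(2, 4, 8), torch.randn(3, 4, 8)]
int_w_outputs = [o + 0.01 * torch.randn_like(o) for o in fp16_outputs]

# Old approach: accumulate squared error per batch, normalize at the end
loss_old = sum(
    (a - b).view(-1).float().pow(2).sum().item()
    for a, b in zip(fp16_outputs, int_w_outputs)
) / sum(o.numel() for o in fp16_outputs)

# New approach: flatten and concatenate once, then take a single mean
fp16_output = torch.cat([o.reshape(-1) for o in fp16_outputs])
int_w_output = torch.cat([o.reshape(-1) for o in int_w_outputs])
loss_new = (fp16_output - int_w_output).view(-1).float().pow(2).mean()

assert abs(loss_old - loss_new.item()) < 1e-6

Note also that _compute_loss moves from a method to a module-level function, presumably so @torch.compile() traces a plain tensor-to-tensor function without self in the signature.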