Revise & add comments

aladerran · aladerran · commit 2c688cb49584 · 2025-07-21T23:27:07.000+08:00
diff --git a/src/llmcompressor/modifiers/quantization/gptq/gptq_quantize.py b/src/llmcompressor/modifiers/quantization/gptq/gptq_quantize.py
@@ -18,10 +18,6 @@
 from llmcompressor.observers.base import Observer
 from llmcompressor.pytorch.utils.helpers import tensor_sparsity
 
-torch._dynamo.config.capture_scalar_outputs = True
-torch._inductor.config.triton.tile_reductions = True
-torch.set_float32_matmul_precision("high")
-
 GPTQ_PRECISION = torch.float32
 
 __all__ = ["make_empty_hessian", "accumulate_hessian", "quantize_weight"]
@@ -74,7 +70,7 @@ def accumulate_hessian(
     return H, num_samples
 
 
-def quantize_weight(
+def quantize_weight_original(
     module: torch.nn.Module,
     quant_args: QuantizationArgs,
     hessians_dict: Dict[torch.nn.Module, torch.Tensor],
@@ -296,6 +292,7 @@ def _process_block(
     quant_max: int,
     sym: bool,
 ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """Process a single block of weight columns with torch.compile support."""
     count = W1.shape[1]
     Q1 = torch.zeros_like(W1)
     Err1 = torch.zeros_like(W1)
@@ -349,11 +346,13 @@ def _quantize_core(
     num_rows: int,
     num_columns: int,
 ) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Core GPTQ quantization loop processing weights in blocks."""
     losses = torch.zeros(num_rows, device=W.device, dtype=W.dtype)
 
     for i1 in range(0, num_columns, blocksize):
         i2 = min(i1 + blocksize, num_columns)
 
+        # Extract current block and corresponding Hessian/quantization params
         W1 = W[:, i1:i2].clone()
         Hinv1 = Hinv[i1:i2, i1:i2].contiguous()
         scale_slice = scale_map[:, i1:i2]
@@ -362,13 +361,15 @@ def _quantize_core(
         if W_nz_mask is not None:
             mask_slice = W_nz_mask[:, i1:i2]
 
+        # Quantize the current block
         Q1, Err1, losses1 = _process_block(
             W1, Hinv1, scale_slice, zero_slice, mask_slice, quant_min, quant_max, sym
         )
 
         W[:, i1:i2] = Q1
         losses += losses1.sum(dim=1) / 2
 
+        # Propagate block error to remaining unprocessed columns
         w_err = Err1 @ Hinv[i1:i2, i2:]
         if W_nz_mask is not None:
             mask_rest = W_nz_mask[:, i2:]
@@ -379,7 +380,7 @@ def _quantize_core(
     return W, losses
 
 
-def quantize_weight_optimized(
+def quantize_weight(
     module: torch.nn.Module,
     quant_args: QuantizationArgs,
     hessians_dict: Dict[torch.nn.Module, torch.Tensor],