[Quantization] Update group quantization (#336)

dsikka · web-flow · commit b2b95b71c418 · 2025-06-02T11:23:52.000-05:00
* update

* fix conditions
diff --git a/src/compressed_tensors/quantization/lifecycle/forward.py b/src/compressed_tensors/quantization/lifecycle/forward.py
@@ -227,31 +227,42 @@ def _process_quantization(
             perm = torch.argsort(g_idx)
             x = safe_permute(x, perm, dim=1)
 
-        # TODO: experiment with vectorizing for loop for performance
-        end = 0
-        for index, group_count in enumerate(group_sizes):
-            sc = scale[:, index].view(-1, 1)
-            zp = zero_point[:, index].view(-1, 1) if zero_point is not None else None
-
-            start = end
-            end = start + group_count
-            if do_quantize:
-                output[:, start:end] = _quantize(
-                    x=x[:, start:end],
-                    scale=sc,
-                    zero_point=zp,
-                    q_min=q_min,
-                    q_max=q_max,
-                    args=args,
-                    dtype=dtype,
-                    global_scale=global_scale,
-                )
+        x = torch.reshape(
+            x,
+            (
+                x.shape[0],
+                ceil(x.shape[1] / group_size),
+                group_size,
+            ),
+        )
 
-            if do_dequantize:
-                input = output[:, start:end] if do_quantize else x[:, start:end]
-                output[:, start:end] = _dequantize(
-                    x_q=input, scale=sc, zero_point=zp, global_scale=global_scale
-                )
+        if do_quantize:
+            output = _quantize(
+                x=x,
+                scale=scale.unsqueeze(-1),
+                zero_point=zero_point.unsqueeze(-1) if zero_point is not None else None,
+                dtype=dtype,
+                global_scale=global_scale,
+                q_min=q_min,
+                q_max=q_max,
+                args=args,
+            )
+
+        if do_dequantize:
+            input = output if do_quantize else x
+            output = _dequantize(
+                x_q=input,
+                scale=scale.unsqueeze(-1),
+                zero_point=zero_point.unsqueeze(-1) if zero_point is not None else None,
+                global_scale=global_scale,
+            )
+
+        output = torch.reshape(
+            output,
+            (output.shape[0], output.shape[1] * output.shape[2]),
+        )
+
+        output = output.to(output_dtype)
 
         if not is_column_order:
             output = safe_permute(output, torch.argsort(perm), dim=1)