
Commit 331d939

jerryzh168 authored and facebook-github-bot committed
Fixes for running GPTQ in executorch (#58)
Summary:
Pull Request resolved: #58

att

Reviewed By: cpuhrsch

Differential Revision: D54885767

fbshipit-source-id: 331af7c1e6fdb2fc8202f1dc8a34e0a42b1d6314
1 parent c1b564a commit 331d939

File tree

2 files changed: +40 -5 lines changed


torchao/quantization/GPTQ.py

Lines changed: 2 additions & 3 deletions

@@ -93,9 +93,8 @@ def setup_cache_padded_seq_input_pos_max_seq_length_for_prefill(
     input_pos = torch.arange(0, T, device=device)
 
     # no caches in executorch llama2 7b model?
-    print("setting up cache")
-    with torch.device(device):
-        model.setup_caches(max_batch_size=1, max_seq_length=max_seq_length)
+    # with torch.device(device):
+    # model.setup_caches(max_batch_size=1, max_seq_length=max_seq_length)
 
     return seq, input_pos, max_seq_length

torchao/quantization/quant_api.py

Lines changed: 38 additions & 2 deletions

@@ -394,6 +394,40 @@ def quantize(
     return model
 
 
+def linear_forward_8da4w(
+    x, weight_int8, scales, zeros, out_features, group_size, precision
+):
+    x = per_token_dynamic_quant(x)
+    # TODO: verify and remove following reshape code
+    # origin_x_size = x.size()
+    # x = x.reshape(-1, origin_x_size[-1])
+
+    # TODO: better API
+    # weight_int8 = torch.ops.quantized_decomposed.unpack_int4_to_int8(weight_int4packed)
+    n_bit = 4
+    quant_min = -(2 ** (n_bit - 1))
+    quant_max = 2 ** (n_bit - 1) - 1
+    w_dq = torch.ops.quantized_decomposed.dequantize_per_channel_group(
+        weight_int8,
+        scales,
+        zeros,
+        quant_min,
+        quant_max,
+        torch.int8,
+        group_size,
+        precision,
+    )
+
+    # x = x.to(torch.float16)
+    # w_dq = w_dq.to(torch.float16)
+    c = torch.nn.functional.linear(x, w_dq)
+
+    # new_shape = origin_x_size[:-1] + (out_features,)
+    # c = c.reshape(new_shape)
+
+    return c
+
+
 class Int8DynActInt4WeightLinear(torch.nn.Module):
     __constants__ = ["in_features", "out_features"]
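For reference, the per-channel-group dequantization that linear_forward_8da4w delegates to torch.ops.quantized_decomposed.dequantize_per_channel_group can be sketched in plain PyTorch roughly as follows. This is only an illustration of the expected semantics (one affine scale/zero-point pair per group along the input dimension), not the op's actual implementation; the function name and tensor shapes below are assumptions for the sketch.

import torch

def dequantize_per_channel_group_ref(w_int8, scales, zeros, group_size, out_dtype=torch.float32):
    # Assumed shapes (illustration only):
    #   w_int8: [out_features, in_features], int8 codes in the 4-bit range [-8, 7]
    #   scales: [out_features, in_features // group_size]
    #   zeros:  [out_features, in_features // group_size]
    out_features, in_features = w_int8.shape
    w = w_int8.to(out_dtype).reshape(out_features, in_features // group_size, group_size)
    s = scales.to(out_dtype).unsqueeze(-1)  # broadcast one scale per group
    z = zeros.to(out_dtype).unsqueeze(-1)   # broadcast one zero point per group
    return ((w - z) * s).reshape(out_features, in_features)

With that, the body above reduces to torch.nn.functional.linear(per_token_dynamic_quant(x), w_dq), where w_dq is this group-wise affine dequantization of the stored int8 codes.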

@@ -433,6 +467,7 @@ def __init__(
         self.in_features = in_features
         self.out_features = out_features
         assert not bias, "require bias=False"
+        # TODO: align groupsize naming
         self.group_size = group_size
         # Precision of the activation which also indicates
         # output precision of the dynamically quantized linear layer
@@ -469,10 +504,11 @@ def forward(self, input: torch.Tensor) -> torch.Tensor:
             self.scales,
             self.zeros,
             self.out_features,
-            self.groupsize,
+            self.group_size,
             self.precision,
         )
 
+
 from math import gcd
 from functools import reduce
@@ -630,7 +666,7 @@ def _convert_for_runtime(self, model):
             model,
             self.groupsize,
             self.padding_allowed,
-            torch.int8,
+            self.precision,
             self.precision,
         )
         return model
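As a rough end-to-end illustration of the inputs this path consumes, here is a hedged sketch that group-quantizes a float weight into int8-stored 4-bit codes, then dequantizes group-wise and applies the linear, mirroring linear_forward_8da4w minus the dynamic per-token activation quantization. The helper name, the 512x256 weight, and group_size=128 are illustrative assumptions, not part of this commit.

import torch
import torch.nn.functional as F

def group_quantize_4bit_ref(w, group_size):
    # Asymmetric per-channel-group 4-bit quantization (illustrative reference).
    # w: [out_features, in_features] float; returns int8 codes plus per-group scales/zeros.
    out_f, in_f = w.shape
    g = w.reshape(out_f, in_f // group_size, group_size)
    w_min, w_max = g.amin(dim=-1), g.amax(dim=-1)
    quant_min, quant_max = -8, 7  # signed 4-bit range
    scales = (w_max - w_min).clamp(min=1e-6) / (quant_max - quant_min)
    zeros = quant_min - torch.round(w_min / scales)
    q = torch.round(g / scales.unsqueeze(-1)) + zeros.unsqueeze(-1)
    q = q.clamp(quant_min, quant_max).to(torch.int8)
    return q.reshape(out_f, in_f), scales, zeros

x = torch.randn(2, 256)
w = torch.randn(512, 256)
w_int8, scales, zeros = group_quantize_4bit_ref(w, group_size=128)

# Dequantize group-wise ((q - zero) * scale) and apply the linear, as in the diff above.
w_dq = (w_int8.reshape(512, -1, 128).float() - zeros.unsqueeze(-1)) * scales.unsqueeze(-1)
y = F.linear(x, w_dq.reshape(512, 256))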
