
Commit 293ae7b

Refactor GPTQ Quantizer, remove lm_eval (#104)
Summary: Refactor the GPTQ code: remove GPTQ's lm_eval dependency, remove InputRecorder's dependency on the model, and make GPTQ work with gpt-fast. Also fix the model so that its kv_cache doesn't break GPTQ.

Test Plan: python test/quantization/test_quant_api.py

Reviewers:

Subscribers:

Tasks:

Tags:

ghstack-source-id: d6eb810
Pull Request resolved: #103
1 parent 5420089 commit 293ae7b
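
The key API change is that calibration is now a separate, explicit step. The sketch below assembles the new flow from the test diff in this commit; it assumes a gpt-fast style `model` and a SentencePiece `tokenizer` are already loaded, as in the tests.

    # Hedged sketch of the new flow; `model` and `tokenizer` are assumed to be
    # a gpt-fast Transformer and a SentencePieceProcessor, set up as in the tests.
    from torchao.quantization.GPTQ import Int8DynActInt4WeightGPTQQuantizer, InputRecorder
    from model import prepare_inputs_for_model

    blocksize, percdamp, groupsize = 128, 0.01, 128
    calibration_seq_length = 100

    # 1) Record calibration inputs (no lm_eval, no model reference needed).
    inputs = InputRecorder(
        tokenizer,
        calibration_seq_length,
        prepare_inputs_for_model,  # formats raw token ids into (x, input_pos)
        False,                     # pad_calibration_inputs
        model.config.vocab_size,
    ).record_inputs(["wikitext"], 1).get_inputs()

    # 2) Quantize, passing the recorded inputs explicitly.
    quantizer = Int8DynActInt4WeightGPTQQuantizer(blocksize, percdamp, groupsize)
    model.setup_caches(max_batch_size=1, max_seq_length=calibration_seq_length)
    model = quantizer.quantize(model, inputs)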

File tree: 7 files changed, +896 −814 lines


test/quantization/model.py

Lines changed: 12 additions & 4 deletions

@@ -11,6 +11,16 @@
 from torch import Tensor
 from torch.nn import functional as F

+def prepare_inputs_for_model(inps):
+    # setup inputs in correct format
+    max_new_tokens = 1
+    T = inps.size(0)
+    T_new = T + max_new_tokens
+    seq = torch.empty(T_new, dtype=inps.dtype, device=inps.device)
+    seq[:T] = inps
+    input_pos = torch.arange(0, T, device=inps.device)
+    x = seq.index_select(0, input_pos).view(1, -1)
+    return (x, input_pos)

 def find_multiple(n: int, k: int) -> int:
     if n % k == 0:

@@ -76,10 +86,8 @@ def update(self, input_pos, k_val, v_val):
         # input_pos: [S], k_val: [B, H, S, D]
         assert input_pos.shape[0] == k_val.shape[2]

-        k_out = self.k_cache
-        v_out = self.v_cache
-        k_out[:, :, input_pos] = k_val
-        v_out[:, :, input_pos] = v_val
+        k_out = torch.ops.aten.index_put_(self.k_cache, [None, None, input_pos], k_val)
+        v_out = torch.ops.aten.index_put_(self.v_cache, [None, None, input_pos], v_val)

         return k_out, v_out
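
The kv_cache change above replaces in-place advanced-index assignment with torch.ops.aten.index_put_, which is functionally identical; per the commit summary, this is what keeps the cache from breaking GPTQ. A small self-contained equivalence check (illustration only, not part of the commit):

    import torch

    # [B, H, S, D] cache and a 3-token update, mirroring KVCache.update
    k_cache = torch.zeros(1, 2, 8, 4)
    k_val = torch.randn(1, 2, 3, 4)
    input_pos = torch.arange(3)

    expected = k_cache.clone()
    expected[:, :, input_pos] = k_val  # old formulation

    # New formulation: index_put_ indexes only dim 2 (None = all of dims 0, 1).
    k_out = torch.ops.aten.index_put_(k_cache, [None, None, input_pos], k_val)
    assert torch.equal(k_out, expected)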

test/quantization/test_quant_api.py

Lines changed: 72 additions & 9 deletions

@@ -29,7 +29,7 @@
 )
 from pathlib import Path
 from sentencepiece import SentencePieceProcessor
-from model import Transformer
+from model import Transformer, prepare_inputs_for_model


 def dynamic_quant(model, example_inputs):

@@ -139,9 +139,9 @@ def test_dynamic_quant_gpu_unified_api_eager_mode_impl(self):
     @unittest.skipIf(not TORCH_VERSION_AFTER_2_3, "skipping when torch verion is 2.3 or lower")
     def test_8da4w_quantizer(self):
         from torchao.quantization.quant_api import Int8DynActInt4WeightQuantizer
-        from torchao.quantization.quant_api import Int8DynActInt4WeightLinear
+        from torchao.quantization.GPTQ import Int8DynActInt4WeightLinear

-        quantizer = Int8DynActInt4WeightQuantizer(group_size=32)
+        quantizer = Int8DynActInt4WeightQuantizer(groupsize=32)
         m = M().eval()
         example_inputs = m.example_inputs()
         m = quantizer.quantize(m)

@@ -151,7 +151,7 @@ def test_8da4w_quantizer(self):

     @unittest.skip("skipping until we get checkpoints for gpt-fast")
     def test_gptq_quantizer(self):
-        from torchao.quantization.quant_api import Int8DynActInt4WeightGPTQQuantizer
+        from torchao.quantization.GPTQ import Int8DynActInt4WeightGPTQQuantizer, InputRecorder
         # should be similar to TorchCompileDynamicQuantizer
         precision = torch.bfloat16
         device = "cpu"

@@ -169,20 +169,83 @@ def test_gptq_quantizer(self):
         percdamp = 0.01
         groupsize = 128
         calibration_tasks = ["wikitext"]
-        calibration_limit = 5
+        calibration_limit = 1
         calibration_seq_length = 100
+        input_prep_func = prepare_inputs_for_model
         pad_calibration_inputs = False
-        quantizer = Int8DynActInt4WeightGPTQQuantizer(
+
+        inputs = InputRecorder(
             tokenizer,
+            calibration_seq_length,
+            input_prep_func,
+            pad_calibration_inputs,
+            model.config.vocab_size,
+        ).record_inputs(
+            calibration_tasks,
+            calibration_limit,
+        ).get_inputs()
+
+        quantizer = Int8DynActInt4WeightGPTQQuantizer(
             blocksize,
             percdamp,
             groupsize,
-            calibration_tasks,
-            calibration_limit,
+        )
+        model.setup_caches(max_batch_size=1, max_seq_length=calibration_seq_length)
+        model = quantizer.quantize(model, inputs)
+        compiled = torch.compile(model, mode="max-autotune")
+        with torch.no_grad():
+            compiled(inputs[0].values[0], inputs[1].values[0])
+
+    @unittest.skip("skipping until we get checkpoints for gpt-fast")
+    def test_gptq_quantizer_gpt_fast(self):
+        from torchao.quantization.GPTQ import Int8DynActInt4WeightGPTQQuantizer, InputRecorder
+        # should be similar to TorchCompileDynamicQuantizer
+        precision = torch.bfloat16
+        device = "cuda"
+        checkpoint_path = Path("../gpt-fast/checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth")
+        model = Transformer.from_name(checkpoint_path.parent.name)
+        checkpoint = torch.load(str(checkpoint_path), mmap=True, weights_only=True)
+        model.load_state_dict(checkpoint, assign=True)
+        model = model.to(dtype=precision, device=device)
+        tokenizer_path = checkpoint_path.parent / "tokenizer.model"
+        assert tokenizer_path.is_file(), tokenizer_path
+        tokenizer = SentencePieceProcessor(  # pyre-ignore[28]
+            model_file=str(tokenizer_path)
+        )
+        blocksize = 128
+        percdamp = 0.01
+        groupsize = 128
+        calibration_tasks = ["wikitext"]
+        calibration_limit = 1
+        calibration_seq_length = 100
+        input_prep_func = prepare_inputs_for_model
+        pad_calibration_inputs = False
+
+        inputs = InputRecorder(
+            tokenizer,
             calibration_seq_length,
+            input_prep_func,
             pad_calibration_inputs,
+            model.config.vocab_size,
+        ).record_inputs(
+            calibration_tasks,
+            calibration_limit,
+        ).get_inputs()
+
+        quantizer = Int8DynActInt4WeightGPTQQuantizer(
+            blocksize,
+            percdamp,
+            groupsize,
+            _is_gpt_fast=True,
+            _use_cuda=True,
         )
-        model = quantizer.quantize(model)
+
+        model.setup_caches(max_batch_size=1, max_seq_length=calibration_seq_length)
+
+        model = quantizer.quantize(model, inputs)
+        compiled = torch.compile(model, mode="max-autotune")
+        with torch.no_grad():
+            compiled(inputs[0].values[0], inputs[1].values[0])

 if __name__ == "__main__":
     unittest.main()
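
For reference, the new prepare_inputs_for_model helper (added to model.py above and passed to InputRecorder as input_prep_func) reshapes a 1-D tensor of token ids into the (x, input_pos) pair the Transformer forward expects. A minimal usage sketch with made-up token ids:

    import torch
    from model import prepare_inputs_for_model  # test/quantization/model.py

    tokens = torch.tensor([1, 15043, 3186])  # any 1-D LongTensor of token ids (T = 3)
    x, input_pos = prepare_inputs_for_model(tokens)

    print(x)          # tensor([[    1, 15043,  3186]]) -- shape [1, T]
    print(input_pos)  # tensor([0, 1, 2])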
