From cb0a90c8ec7bd071b3b0dd95d253291865672fb1 Mon Sep 17 00:00:00 2001
From: Dipika
Date: Mon, 3 Mar 2025 23:03:02 +0000
Subject: [PATCH 01/11] apply quip recipe

---
 examples/weight_transform.py | 118 +++++++++++++++++++++++++++++++++++
 1 file changed, 118 insertions(+)
 create mode 100644 examples/weight_transform.py

diff --git a/examples/weight_transform.py b/examples/weight_transform.py
new file mode 100644
index 000000000..743c1564f
--- /dev/null
+++ b/examples/weight_transform.py
@@ -0,0 +1,118 @@
+from compressed_tensors.transforms import Hadamard, RandomHadamard, Transforms
+from compressed_tensors.transforms.transform_args import (
+    ModuleTarget,
+    TransformationArgs,
+)
+from compressed_tensors.transforms.transform_config import TransformationConfig
+from compressed_tensors.transforms.transform_data import TransformData
+from compressed_tensors.transforms.transform_scheme import TransformationScheme
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
+
+ignore = ["re:*.mlp.down_proj$"]
+module_targets = [ModuleTarget.WEIGHTS]
+
+# Start with a processed
+targets = ["Linear"]  # 2048 * 2048
+v_linear_args = TransformationArgs(
+    targets=targets, module_targets=module_targets, ignore=ignore, call_args={"transpose": True, "first": False}
+)
+
+targets = ["re:*.mlp.down_proj$"]  # 5632 * 5632
+v_down_proj = TransformationArgs(
+    targets=targets, module_targets=module_targets, call_args={"transpose": True, "first": False}
+)
+
+targets = ["re:*.attn.q_proj$", "re:*.attn.o_proj$", "re:*.mlp.down_proj$"]  # 2048 * 2048
+u_q_o_down_proj = TransformationArgs(
+    targets=targets, module_targets=module_targets,
+)
+
+targets = ["re:*.attn.gate_proj$", "re:*.mlp.up_proj$"]  # 5632 * 5632
+u_gate_up_proj = TransformationArgs(
+    targets=targets, module_targets=module_targets,
+)
+
+targets = ["re:*.attn.k_proj$", "re:*.attn.v_proj$"]  # 256 * 256
+u_k_v_proj = TransformationArgs(
+    targets=targets, module_targets=module_targets,
+)
+
+
+# This will apply the random_had to the first set of args
+# It will then apply the second set of args
+# any overalp will be applied in order
+v_scheme = TransformationScheme(
+    transform_type="random-hadamard",
+    groups=[v_linear_args],
+    transform_creation_args={"size": 2048},
+)
+
+v_scheme_down_proj = TransformationScheme(
+    transform_type="random-hadamard",
+    groups=[v_down_proj],
+    transform_creation_args={"size": 5632},
+)
+
+u_scheme_q_o_down_proj = TransformationScheme(
+    transform_type="random-hadamard",
+    groups=[u_q_o_down_proj],
+    transform_creation_args={"size": 2048},
+)
+
+u_scheme_gate_up_proj = TransformationScheme(
+    transform_type="random-hadamard",
+    groups=[u_gate_up_proj],
+    transform_creation_args={"size": 5632},
+)
+
+u_scheme_k_v_proj = TransformationScheme(
+    transform_type="random-hadamard",
+    groups=[u_k_v_proj],
+    transform_creation_args={"size": 256},
+)
+
+# QuIP Recipe with weight only quantization
+config = TransformationConfig(
+    transform_groups={
+        "u_transform_q_o_down_proj": u_scheme_q_o_down_proj,
+        "u_transform_gate_up_proj": u_scheme_gate_up_proj,
+        "u_transform_k_v_proj": u_scheme_k_v_proj,
+        "v_transform_linear": v_scheme,
+        "v_transform_down_proj": v_scheme_down_proj
+    }
+)
+
+#MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
+MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID,
+    device_map="auto",
+    torch_dtype="auto",
+)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+x = model.model.layers[0]
+attn = x.self_attn
+mlp = x.mlp
+
+layers = [
+    attn.q_proj,
+    attn.k_proj,
+    attn.v_proj,
+    attn.o_proj,
+    mlp.gate_proj,
+    mlp.down_proj,
+    mlp.up_proj
+]
+
+for layer in layers:
+
+    current_weight = layer.weight
+    (n, m) = current_weight.shape
+    U = torch.eye(n).to("cuda").to(torch.bfloat16)
+    V = torch.eye(m).to("cuda").to(torch.bfloat16)
+    print(n, layer)
+
+    output = torch.matmul(U, current_weight)
+    output = torch.matmul(output, V.T)

From 9435d811bceb40610088bef5db21b5250cdc91af Mon Sep 17 00:00:00 2001
From: Dipika
Date: Mon, 3 Mar 2025 23:03:53 +0000
Subject: [PATCH 02/11] update

---
 examples/weight_transform.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/weight_transform.py b/examples/weight_transform.py
index 743c1564f..83e54920a 100644
--- a/examples/weight_transform.py
+++ b/examples/weight_transform.py
@@ -54,6 +54,7 @@
     transform_creation_args={"size": 5632},
 )
 
+# We could combine multiple args to the same scheme but then would make it more difficult to consolidate order of transforms
 u_scheme_q_o_down_proj = TransformationScheme(
     transform_type="random-hadamard",
     groups=[u_q_o_down_proj],

From a7bf319bb84e5d3888d07e166b721df179bbb1e7 Mon Sep 17 00:00:00 2001
From: Dipika
Date: Thu, 6 Mar 2025 01:40:25 +0000
Subject: [PATCH 03/11] update

---
 .../modifiers/quantization/calibration.py | 23 +++++++++++++++++++
 .../quantization/quantization/base.py     |  6 ++++-
 2 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/src/llmcompressor/modifiers/quantization/calibration.py b/src/llmcompressor/modifiers/quantization/calibration.py
index bcb4b7433..3245c8604 100644
--- a/src/llmcompressor/modifiers/quantization/calibration.py
+++ b/src/llmcompressor/modifiers/quantization/calibration.py
@@ -120,8 +120,31 @@ def update_weight_zp_scale(module: Module):
 
     if module.quantization_scheme.weights is not None:
         # set weight scale and zero_point up front, calibration data doesn't affect it
+
+        transform_data = getattr(module, "transform_data", None)
+        if transform_data is not None:
+            # order that the transforms were added to match the order they should be applied
+            untransformed_weight = module.weight.data.clone()
+            for transform_name, transform_values in transform_data.data.items():
+                transform = getattr(module, transform_name)
+                apply = transform_values.get("apply")
+                call_args = transform_values.get("call_args")
+                if call_args:
+                    transformed_weight = apply(
+                        input_tensor=module.weight, transform=transform, **call_args
+                    )
+                else:
+                    transformed_weight = apply(
+                        input_tensor=module.weight, transform=transform
+                    )
+            module.weight.data.copy_(transformed_weight)
+
         call_observer(module=module, base_name="weight")
 
+        # TODO: what do we do here?
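# The hunk above computes the weight scale and zero point from the transformed
# weight, and the added lines that follow restore the untransformed weight, so
# the transforms only influence calibration statistics rather than the stored
# parameter. A minimal sketch of that flow for a module with a single U/V pair
# (the U and V names here are illustrative, not the compressed-tensors API):
#
#     original = module.weight.data.clone()
#     module.weight.data.copy_(U @ module.weight @ V.T)  # calibrate on U W V^T
#     call_observer(module=module, base_name="weight")   # scale/zp from transformed W
#     module.weight.data.copy_(original)                 # stored weight stays unchanged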
+ if transform_data is not None: + module.weight.data.copy_(untransformed_weight) + def calibrate_activations(module: Module, value: torch.Tensor, base_name: str): """ diff --git a/src/llmcompressor/modifiers/quantization/quantization/base.py b/src/llmcompressor/modifiers/quantization/quantization/base.py index 3a8946aef..f370a79fd 100644 --- a/src/llmcompressor/modifiers/quantization/quantization/base.py +++ b/src/llmcompressor/modifiers/quantization/quantization/base.py @@ -10,6 +10,7 @@ is_preset_scheme, preset_name_to_scheme, ) +from compressed_tensors.transforms.transform_config import TransformationConfig from loguru import logger from pydantic import Field, field_validator from torch.nn import Module @@ -74,6 +75,7 @@ class QuantizationModifier(Modifier): """ config_groups: Optional[Dict[str, QuantizationScheme]] = None + transforms_config: Optional[TransformationConfig] = None ignore: List[str] = Field(default_factory=list) targets: Union[str, List[str]] = Field(default_factory=lambda: ["Linear"]) scheme: Optional[Union[str, Dict[str, Any]]] = None @@ -210,7 +212,9 @@ def _check_calibration_data(self, config: QuantizationConfig): def _apply_modifier_to_model(self, model: Module): modifier_as_config = self.create_init_config() # Add step to attach kv_cache to the model, if present within the config - apply_quantization_config(model, modifier_as_config) + apply_quantization_config( + model, modifier_as_config, transforms_config=self.transforms_config + ) model.apply(set_unset_kv_cache) return modifier_as_config From 4a58bb1520c4b436db0e01bca5837870aabd627b Mon Sep 17 00:00:00 2001 From: Dipika Date: Thu, 6 Mar 2025 23:26:31 +0000 Subject: [PATCH 04/11] clean-up --- examples/weight_transform.py | 121 +++++++++++++----- .../modifiers/quantization/calibration.py | 21 +-- 2 files changed, 97 insertions(+), 45 deletions(-) diff --git a/examples/weight_transform.py b/examples/weight_transform.py index 83e54920a..c980d1945 100644 --- a/examples/weight_transform.py +++ b/examples/weight_transform.py @@ -1,41 +1,62 @@ +import torch +from compressed_tensors.quantization import ( + QuantizationArgs, + QuantizationScheme, + QuantizationStrategy, +) from compressed_tensors.transforms import Hadamard, RandomHadamard, Transforms from compressed_tensors.transforms.transform_args import ( ModuleTarget, TransformationArgs, ) from compressed_tensors.transforms.transform_config import TransformationConfig -from compressed_tensors.transforms.transform_data import TransformData from compressed_tensors.transforms.transform_scheme import TransformationScheme from transformers import AutoModelForCausalLM, AutoTokenizer -import torch -ignore = ["re:*.mlp.down_proj$"] -module_targets = [ModuleTarget.WEIGHTS] +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import QuantizationModifier + +# U(W)V.T + +ignore = ["re:.*.mlp.down_proj$"] +module_targets = [ModuleTarget.WEIGHT.value] -# Start with a processed -targets = ["Linear"] # 2048 * 2048 +# Start with a processed +targets = ["Linear"] # 2048 * 2048 v_linear_args = TransformationArgs( - targets=targets, module_targets=module_targets, ignore=ignore, call_args={"transpose": True, "first": False} + targets=targets, + module_targets=module_targets, + ignore=ignore, + call_args={"transpose": True, "first": False}, ) -targets = ["re:*.mlp.down_proj$"] # 5632 * 5632 +targets = ["re:.*.mlp.down_proj$"] # 8192 * 8192 v_down_proj = TransformationArgs( - targets=targets, module_targets=module_targets, call_args={"transpose": True, "first": 
False} + targets=targets, + module_targets=module_targets, + call_args={"transpose": True, "first": False}, ) -targets = ["re:*.attn.q_proj$", "re:*.attn.o_proj$", "re:*.mlp.down_proj$"] # 2048 * 2048 +targets = [ + "re:.*.attn.q_proj$", + "re:.*.attn.o_proj$", + "re:.*.mlp.down_proj$", +] # 2048 * 2048 u_q_o_down_proj = TransformationArgs( - targets=targets, module_targets=module_targets, + targets=targets, + module_targets=module_targets, ) -targets = ["re:*.attn.gate_proj$", "re:*.mlp.up_proj$"] # 5632 * 5632 +targets = ["re:.*.mlp.gate_proj$", "re:.*.mlp.up_proj$"] # 8192 * 8192 u_gate_up_proj = TransformationArgs( - targets=targets, module_targets=module_targets, + targets=targets, + module_targets=module_targets, ) -targets = ["re:*.attn.k_proj$", "re:*.attn.v_proj$"] # 256 * 256 +targets = ["re:.*.attn.k_proj$", "re:.*.attn.v_proj$"] # 512 * 512 u_k_v_proj = TransformationArgs( - targets=targets, module_targets=module_targets, + targets=targets, + module_targets=module_targets, ) @@ -51,7 +72,7 @@ v_scheme_down_proj = TransformationScheme( transform_type="random-hadamard", groups=[v_down_proj], - transform_creation_args={"size": 5632}, + transform_creation_args={"size": 8192}, ) # We could combine multiple args to the same scheme but then would make it more difficult to consolidate order of transforms @@ -64,35 +85,65 @@ u_scheme_gate_up_proj = TransformationScheme( transform_type="random-hadamard", groups=[u_gate_up_proj], - transform_creation_args={"size": 5632}, + transform_creation_args={"size": 8192}, ) u_scheme_k_v_proj = TransformationScheme( transform_type="random-hadamard", groups=[u_k_v_proj], - transform_creation_args={"size": 256}, + transform_creation_args={"size": 512}, ) # QuIP Recipe with weight only quantization config = TransformationConfig( transform_groups={ "u_transform_q_o_down_proj": u_scheme_q_o_down_proj, - "u_transform_gate_up_proj": u_scheme_gate_up_proj, "u_transform_k_v_proj": u_scheme_k_v_proj, + "u_transform_gate_up_proj": u_scheme_gate_up_proj, "v_transform_linear": v_scheme, - "v_transform_down_proj": v_scheme_down_proj + "v_transform_down_proj": v_scheme_down_proj, } ) -#MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" -MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" +recipe = QuantizationModifier( + targets="Linear", + ignore=["lm_head"], + config_groups={ + "group_0": QuantizationScheme( + targets=["Linear"], + weights=QuantizationArgs( + num_bits=4, + symmetric=True, + strategy=QuantizationStrategy.GROUP, + group_size=128, + ), + ) + }, + transforms_config=config, +) + +MODEL_ID = "meta-llama/Llama-3.2-1B" model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, - device_map="auto", - torch_dtype="auto", + MODEL_ID, device_map="auto", torch_dtype="auto" ) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +oneshot(model=model, recipe=recipe) + +print("\n\n") +print("========== SAMPLE GENERATION ==============") +input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") +output = model.generate(input_ids, max_new_tokens=100) +print(tokenizer.decode(output[0])) +print("==========================================\n\n") + +# Save to disk compressed. 
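# The transform sizes hard-coded in this script follow the Llama-3.2-1B shapes
# noted in the inline comments: hidden_size 2048 (q/o/down_proj outputs and most
# Linear inputs), intermediate_size 8192 (gate/up_proj outputs, down_proj input),
# and a k/v_proj output of num_key_value_heads * head_dim = 512. A hedged sketch
# of deriving those sizes from the loaded config instead of hard-coding them:
#
#     cfg = model.config
#     hidden = cfg.hidden_size                                              # 2048
#     intermediate = cfg.intermediate_size                                  # 8192
#     kv_dim = cfg.num_key_value_heads * (hidden // cfg.num_attention_heads)  # 512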
+SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-Transforms" +model.save_pretrained(SAVE_DIR) +tokenizer.save_pretrained(SAVE_DIR) + +""" x = model.model.layers[0] attn = x.self_attn mlp = x.mlp @@ -104,16 +155,26 @@ attn.o_proj, mlp.gate_proj, mlp.down_proj, - mlp.up_proj + mlp.up_proj, ] -for layer in layers: +from compressed_tensors.transforms.hadamard_utils import ( + deterministic_hadamard_matrix, + random_hadamard_matrix, +) +for layer in layers: current_weight = layer.weight + original_weight = current_weight.data.clone() (n, m) = current_weight.shape - U = torch.eye(n).to("cuda").to(torch.bfloat16) - V = torch.eye(m).to("cuda").to(torch.bfloat16) - print(n, layer) + + U = torch.Tensor(random_hadamard_matrix(n)).to("cuda").to(torch.float32) + V = torch.Tensor(random_hadamard_matrix(m)).to("cuda").to(torch.float32) output = torch.matmul(U, current_weight) output = torch.matmul(output, V.T) + + # apply untransform + x = torch.matmul(U.T, torch.matmul(output, V)) + print(torch.max(abs(x - original_weight))) +""" diff --git a/src/llmcompressor/modifiers/quantization/calibration.py b/src/llmcompressor/modifiers/quantization/calibration.py index 3245c8604..f20951b88 100644 --- a/src/llmcompressor/modifiers/quantization/calibration.py +++ b/src/llmcompressor/modifiers/quantization/calibration.py @@ -4,6 +4,7 @@ from compressed_tensors.quantization import QuantizationStatus, is_attention_module from compressed_tensors.quantization.lifecycle.forward import forward_quantize from compressed_tensors.quantization.utils import is_kv_cache_quant_scheme +from compressed_tensors.transforms.apply import apply_transforms_to_parameter from compressed_tensors.utils.offload import is_module_offloaded, update_parameter_data from loguru import logger from torch.nn import Module @@ -123,25 +124,15 @@ def update_weight_zp_scale(module: Module): transform_data = getattr(module, "transform_data", None) if transform_data is not None: - # order that the transforms were added to match the order they should be applied untransformed_weight = module.weight.data.clone() - for transform_name, transform_values in transform_data.data.items(): - transform = getattr(module, transform_name) - apply = transform_values.get("apply") - call_args = transform_values.get("call_args") - if call_args: - transformed_weight = apply( - input_tensor=module.weight, transform=transform, **call_args - ) - else: - transformed_weight = apply( - input_tensor=module.weight, transform=transform - ) - module.weight.data.copy_(transformed_weight) + apply_transforms_to_parameter( + module=module, + module_parameter=module.weight, + transform_data=transform_data, + ) call_observer(module=module, base_name="weight") - # TODO: what do we do here? if transform_data is not None: module.weight.data.copy_(untransformed_weight) From 744a31122eac22c0da4e97afaa41b32af309f2fc Mon Sep 17 00:00:00 2001 From: Dipika Date: Fri, 7 Mar 2025 23:48:00 +0000 Subject: [PATCH 05/11] update --- examples/weight_transform.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/weight_transform.py b/examples/weight_transform.py index c980d1945..84c1f1cee 100644 --- a/examples/weight_transform.py +++ b/examples/weight_transform.py @@ -18,7 +18,7 @@ # U(W)V.T -ignore = ["re:.*.mlp.down_proj$"] +ignore = ["re:.*.mlp.down_proj$", "lm_head"] module_targets = [ModuleTarget.WEIGHT.value] # Start with a processed @@ -140,6 +140,7 @@ # Save to disk compressed. 
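# The first hunk of this patch adds "lm_head" to the V-transform ignore list, so
# the catch-all targets=["Linear"] group no longer touches the output head, which
# the quantization recipe in this script also leaves alone. An informal summary of
# the grouping the config above resolves to for one decoder layer:
#
#     # V (input side), size 2048: q/k/v/o_proj, gate_proj, up_proj
#     # V (input side), size 8192: down_proj
#     # U (output side): q/o/down_proj -> 2048, gate/up_proj -> 8192, k/v_proj -> 512
#     # untouched by both transforms and quantization: lm_head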
SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-Transforms" + model.save_pretrained(SAVE_DIR) tokenizer.save_pretrained(SAVE_DIR) From 3084f39b588eb33b9a46e957190409a91818ddc3 Mon Sep 17 00:00:00 2001 From: Dipika Date: Mon, 10 Mar 2025 16:38:23 +0000 Subject: [PATCH 06/11] update --- examples/weight_transform.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/weight_transform.py b/examples/weight_transform.py index 84c1f1cee..731f50449 100644 --- a/examples/weight_transform.py +++ b/examples/weight_transform.py @@ -15,7 +15,6 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier - # U(W)V.T ignore = ["re:.*.mlp.down_proj$", "lm_head"] @@ -64,32 +63,32 @@ # It will then apply the second set of args # any overalp will be applied in order v_scheme = TransformationScheme( - transform_type="random-hadamard", + transform_type="hadamard", groups=[v_linear_args], transform_creation_args={"size": 2048}, ) v_scheme_down_proj = TransformationScheme( - transform_type="random-hadamard", + transform_type="hadamard", groups=[v_down_proj], transform_creation_args={"size": 8192}, ) # We could combine multiple args to the same scheme but then would make it more difficult to consolidate order of transforms u_scheme_q_o_down_proj = TransformationScheme( - transform_type="random-hadamard", + transform_type="hadamard", groups=[u_q_o_down_proj], transform_creation_args={"size": 2048}, ) u_scheme_gate_up_proj = TransformationScheme( - transform_type="random-hadamard", + transform_type="hadamard", groups=[u_gate_up_proj], transform_creation_args={"size": 8192}, ) u_scheme_k_v_proj = TransformationScheme( - transform_type="random-hadamard", + transform_type="hadamard", groups=[u_k_v_proj], transform_creation_args={"size": 512}, ) @@ -116,13 +115,14 @@ symmetric=True, strategy=QuantizationStrategy.GROUP, group_size=128, + observer="mse" ), ) }, transforms_config=config, ) -MODEL_ID = "meta-llama/Llama-3.2-1B" +MODEL_ID = "meta-llama/Llama-3.2-1B-Instruct" model = AutoModelForCausalLM.from_pretrained( MODEL_ID, device_map="auto", torch_dtype="auto" @@ -139,9 +139,9 @@ print("==========================================\n\n") # Save to disk compressed. 
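# This patch switches every scheme from "random-hadamard" to "hadamard" and adds
# an MSE weight observer; the eval notes kept in load_transform_model.py later in
# this series compare the two transform types. A hedged sketch for inspecting
# either construction with the hadamard_utils helpers this series already imports
# (whether the returned matrix is normalized by 1/sqrt(n) is an assumption here):
#
#     from compressed_tensors.transforms.hadamard_utils import (
#         deterministic_hadamard_matrix,
#         random_hadamard_matrix,
#     )
#     H = torch.Tensor(random_hadamard_matrix(2048))
#     # a normalized Hadamard gives H @ H.T ~= I; an unnormalized one gives 2048 * I
#     print(torch.dist(H @ H.T, torch.eye(2048)))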
-SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-Transforms" +SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-uncompressed-hadamard-random" -model.save_pretrained(SAVE_DIR) +model.save_pretrained(SAVE_DIR, save_compressed=False) tokenizer.save_pretrained(SAVE_DIR) """ From 919be473e5eeb5acbc63d49bf347e6a730bd7524 Mon Sep 17 00:00:00 2001 From: Dipika Date: Tue, 11 Mar 2025 22:06:38 +0000 Subject: [PATCH 07/11] remove test script --- examples/weight_transform.py | 181 ----------------------------------- 1 file changed, 181 deletions(-) delete mode 100644 examples/weight_transform.py diff --git a/examples/weight_transform.py b/examples/weight_transform.py deleted file mode 100644 index 731f50449..000000000 --- a/examples/weight_transform.py +++ /dev/null @@ -1,181 +0,0 @@ -import torch -from compressed_tensors.quantization import ( - QuantizationArgs, - QuantizationScheme, - QuantizationStrategy, -) -from compressed_tensors.transforms import Hadamard, RandomHadamard, Transforms -from compressed_tensors.transforms.transform_args import ( - ModuleTarget, - TransformationArgs, -) -from compressed_tensors.transforms.transform_config import TransformationConfig -from compressed_tensors.transforms.transform_scheme import TransformationScheme -from transformers import AutoModelForCausalLM, AutoTokenizer - -from llmcompressor import oneshot -from llmcompressor.modifiers.quantization import QuantizationModifier -# U(W)V.T - -ignore = ["re:.*.mlp.down_proj$", "lm_head"] -module_targets = [ModuleTarget.WEIGHT.value] - -# Start with a processed -targets = ["Linear"] # 2048 * 2048 -v_linear_args = TransformationArgs( - targets=targets, - module_targets=module_targets, - ignore=ignore, - call_args={"transpose": True, "first": False}, -) - -targets = ["re:.*.mlp.down_proj$"] # 8192 * 8192 -v_down_proj = TransformationArgs( - targets=targets, - module_targets=module_targets, - call_args={"transpose": True, "first": False}, -) - -targets = [ - "re:.*.attn.q_proj$", - "re:.*.attn.o_proj$", - "re:.*.mlp.down_proj$", -] # 2048 * 2048 -u_q_o_down_proj = TransformationArgs( - targets=targets, - module_targets=module_targets, -) - -targets = ["re:.*.mlp.gate_proj$", "re:.*.mlp.up_proj$"] # 8192 * 8192 -u_gate_up_proj = TransformationArgs( - targets=targets, - module_targets=module_targets, -) - -targets = ["re:.*.attn.k_proj$", "re:.*.attn.v_proj$"] # 512 * 512 -u_k_v_proj = TransformationArgs( - targets=targets, - module_targets=module_targets, -) - - -# This will apply the random_had to the first set of args -# It will then apply the second set of args -# any overalp will be applied in order -v_scheme = TransformationScheme( - transform_type="hadamard", - groups=[v_linear_args], - transform_creation_args={"size": 2048}, -) - -v_scheme_down_proj = TransformationScheme( - transform_type="hadamard", - groups=[v_down_proj], - transform_creation_args={"size": 8192}, -) - -# We could combine multiple args to the same scheme but then would make it more difficult to consolidate order of transforms -u_scheme_q_o_down_proj = TransformationScheme( - transform_type="hadamard", - groups=[u_q_o_down_proj], - transform_creation_args={"size": 2048}, -) - -u_scheme_gate_up_proj = TransformationScheme( - transform_type="hadamard", - groups=[u_gate_up_proj], - transform_creation_args={"size": 8192}, -) - -u_scheme_k_v_proj = TransformationScheme( - transform_type="hadamard", - groups=[u_k_v_proj], - transform_creation_args={"size": 512}, -) - -# QuIP Recipe with weight only quantization -config = TransformationConfig( - transform_groups={ 
- "u_transform_q_o_down_proj": u_scheme_q_o_down_proj, - "u_transform_k_v_proj": u_scheme_k_v_proj, - "u_transform_gate_up_proj": u_scheme_gate_up_proj, - "v_transform_linear": v_scheme, - "v_transform_down_proj": v_scheme_down_proj, - } -) - -recipe = QuantizationModifier( - targets="Linear", - ignore=["lm_head"], - config_groups={ - "group_0": QuantizationScheme( - targets=["Linear"], - weights=QuantizationArgs( - num_bits=4, - symmetric=True, - strategy=QuantizationStrategy.GROUP, - group_size=128, - observer="mse" - ), - ) - }, - transforms_config=config, -) - -MODEL_ID = "meta-llama/Llama-3.2-1B-Instruct" - -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto" -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -oneshot(model=model, recipe=recipe) - -print("\n\n") -print("========== SAMPLE GENERATION ==============") -input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") -output = model.generate(input_ids, max_new_tokens=100) -print(tokenizer.decode(output[0])) -print("==========================================\n\n") - -# Save to disk compressed. -SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-uncompressed-hadamard-random" - -model.save_pretrained(SAVE_DIR, save_compressed=False) -tokenizer.save_pretrained(SAVE_DIR) - -""" -x = model.model.layers[0] -attn = x.self_attn -mlp = x.mlp - -layers = [ - attn.q_proj, - attn.k_proj, - attn.v_proj, - attn.o_proj, - mlp.gate_proj, - mlp.down_proj, - mlp.up_proj, -] - -from compressed_tensors.transforms.hadamard_utils import ( - deterministic_hadamard_matrix, - random_hadamard_matrix, -) - -for layer in layers: - current_weight = layer.weight - original_weight = current_weight.data.clone() - (n, m) = current_weight.shape - - U = torch.Tensor(random_hadamard_matrix(n)).to("cuda").to(torch.float32) - V = torch.Tensor(random_hadamard_matrix(m)).to("cuda").to(torch.float32) - - output = torch.matmul(U, current_weight) - output = torch.matmul(output, V.T) - - # apply untransform - x = torch.matmul(U.T, torch.matmul(output, V)) - print(torch.max(abs(x - original_weight))) -""" From 82686a56a1b21722a14caf155aec1bb5e3e657f7 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Thu, 3 Apr 2025 21:38:23 +0000 Subject: [PATCH 08/11] update --- .../modifiers/quantization/calibration.py | 25 ++++++---- .../quantization/quantization/base.py | 50 +++++++++++++++---- src/llmcompressor/utils/helpers.py | 1 - 3 files changed, 53 insertions(+), 23 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/calibration.py b/src/llmcompressor/modifiers/quantization/calibration.py index f20951b88..23a987fd4 100644 --- a/src/llmcompressor/modifiers/quantization/calibration.py +++ b/src/llmcompressor/modifiers/quantization/calibration.py @@ -2,9 +2,8 @@ import torch from compressed_tensors.quantization import QuantizationStatus, is_attention_module -from compressed_tensors.quantization.lifecycle.forward import forward_quantize from compressed_tensors.quantization.utils import is_kv_cache_quant_scheme -from compressed_tensors.transforms.apply import apply_transforms_to_parameter +from compressed_tensors.transforms.apply import apply_transforms_to_activations_or_parameter from compressed_tensors.utils.offload import is_module_offloaded, update_parameter_data from loguru import logger from torch.nn import Module @@ -125,9 +124,9 @@ def update_weight_zp_scale(module: Module): transform_data = getattr(module, "transform_data", None) if transform_data is not None: untransformed_weight = 
module.weight.data.clone() - apply_transforms_to_parameter( + apply_transforms_to_activations_or_parameter( module=module, - module_parameter=module.weight, + module_activation_or_parameter=module.weight, transform_data=transform_data, ) @@ -152,11 +151,22 @@ def calibrate_activations(module: Module, value: torch.Tensor, base_name: str): if value.numel() == 0: return + transform_data = getattr(module, "transform_data", None) + if transform_data is not None: + value = apply_transforms_to_activations_or_parameter( + module=module, + module_activation_or_parameter=value, + transform_data=transform_data, + update_in_place=False + ) + call_observer( module=module, base_name=base_name, value=value, ) + breakpoint() + # validate value is correct def calibrate_input_hook(module: Module, args: Any): @@ -180,12 +190,6 @@ def calibrate_output_hook(module: Module, _args: Any, output: torch.Tensor): value=output, base_name="output", ) - output = forward_quantize( - module=module, - value=output, - base_name="output", - args=module.quantization_scheme.output_activations, - ) return output @@ -211,7 +215,6 @@ def calibrate_kv_cache_output_hook(module: Module, _args: Any, _output: torch.Te update_parameter_data(module, kv_cache.k_scales[module.layer_idx], "k_scale") update_parameter_data(module, kv_cache.v_scales[module.layer_idx], "v_scale") - def set_unset_kv_cache(module: Module): """ Set or unset singleton QuantizedKVParameterCache for each diff --git a/src/llmcompressor/modifiers/quantization/quantization/base.py b/src/llmcompressor/modifiers/quantization/quantization/base.py index f370a79fd..b514d0c0b 100644 --- a/src/llmcompressor/modifiers/quantization/quantization/base.py +++ b/src/llmcompressor/modifiers/quantization/quantization/base.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Union, Set from compressed_tensors.quantization import ( QuantizationArgs, @@ -10,6 +10,11 @@ is_preset_scheme, preset_name_to_scheme, ) +from compressed_tensors.quantization.lifecycle import ( + post_forward_quantize, + pre_forward_quantize, + register_quantization_hooks, +) from compressed_tensors.transforms.transform_config import TransformationConfig from loguru import logger from pydantic import Field, field_validator @@ -85,6 +90,7 @@ class QuantizationModifier(Modifier): calibration_dataloader_: Any = None calibration_function_: Any = None + _handles: Set = set() @field_validator("targets", mode="before") def validate_targets(cls, value: Union[str, List[str]]) -> List[str]: @@ -213,7 +219,10 @@ def _apply_modifier_to_model(self, model: Module): modifier_as_config = self.create_init_config() # Add step to attach kv_cache to the model, if present within the config apply_quantization_config( - model, modifier_as_config, transforms_config=self.transforms_config + model, + modifier_as_config, + transforms_config=self.transforms_config, + delay_forward_quantize=True, ) model.apply(set_unset_kv_cache) return modifier_as_config @@ -262,6 +271,9 @@ def _calibrate_if_possible(self, module: Module): ) elif not self.calibration_dataloader_: + # TODO: should just use HooksMixin + # hooks should have been delayed + module.apply(lambda model: register_quantization_hooks(model)) return module.apply(lambda model: initialize_observer(model, base_name="input")) @@ -269,7 +281,7 @@ def _calibrate_if_possible(self, module: Module): module.apply(self.register_calibration_hooks) self._calibrate(module) module.apply(set_unset_kv_cache) - self.remove_hooks() + 
self.remove_hooks(self._handles) def register_calibration_hooks(self, module: Module): """ @@ -289,23 +301,39 @@ def register_calibration_hooks(self, module: Module): # Calibrate inputs if an input_quant is provided and not running dynamic quant if calibrate_inputs: - self.register_hook(module, calibrate_input_hook, "forward_pre") + self._handles.add( + self.register_hook(module, calibrate_input_hook, "forward_pre") + ) + + if not is_attention_module_: + self.register_hook(module, pre_forward_quantize, "forward_pre") if output_quant: # hooks for attn modules if running kv_cache quant if is_attention_module_: - self.register_hook( - module, - calibrate_kv_cache_input_hook, - "forward_pre", - with_kwargs=True, + self._handles.add( + self.register_hook( + module, + calibrate_kv_cache_input_hook, + "forward_pre", + with_kwargs=True, + ) ) - self.register_hook(module, calibrate_kv_cache_output_hook, "forward") + self._handles.add( + self.register_hook( + module, calibrate_kv_cache_output_hook, "forward" + ) + ) # hooks for output quant if not running dynamic quant elif not output_quant.dynamic: - self.register_hook(module, calibrate_output_hook, "forward") + self._handles.add( + self.register_hook(module, calibrate_output_hook, "forward") + ) + + if not is_attention_module_: + self.register_hook(module, post_forward_quantize, "forward") def _calibrate(self, module: Module): class_name = self.__class__.__name__.replace("PyTorch", "") diff --git a/src/llmcompressor/utils/helpers.py b/src/llmcompressor/utils/helpers.py index 75fad8311..6c5919652 100644 --- a/src/llmcompressor/utils/helpers.py +++ b/src/llmcompressor/utils/helpers.py @@ -1125,7 +1125,6 @@ def calibration_forward_context(model: PreTrainedModel): with ( torch.no_grad(), DisableKVCache(model), - DisableQuantization(model), eval_context(model), ): yield From a06da00f6609fee7c59ef4c0405459f38bc66583 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Mon, 5 May 2025 21:04:55 +0000 Subject: [PATCH 09/11] add script to apply weight-only transformers --- weight_transform.py | 145 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 145 insertions(+) create mode 100644 weight_transform.py diff --git a/weight_transform.py b/weight_transform.py new file mode 100644 index 000000000..74b61078f --- /dev/null +++ b/weight_transform.py @@ -0,0 +1,145 @@ +import torch +from compressed_tensors.quantization import ( + QuantizationArgs, + QuantizationScheme, + QuantizationStrategy, +) +from compressed_tensors.transforms import Hadamard, RandomHadamard, Transforms +from compressed_tensors.transforms.transform_args import ( + ModuleTarget, + TransformationArgs, +) +from compressed_tensors.transforms.transform_config import TransformationConfig +from compressed_tensors.transforms.transform_scheme import TransformationScheme +from transformers import AutoModelForCausalLM, AutoTokenizer + +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import QuantizationModifier +# U(W)V.T + +ignore = ["re:.*.mlp.down_proj$", "lm_head"] +module_targets = [ModuleTarget.WEIGHT.value] + +# Start with a processed +targets = ["Linear"] # 2048 * 2048 +v_linear_args = TransformationArgs( + targets=targets, + module_targets=module_targets, + ignore=ignore, + call_args={"transpose": True, "first": False}, +) + +targets = ["re:.*.mlp.down_proj$"] # 8192 * 8192 +v_down_proj = TransformationArgs( + targets=targets, + module_targets=module_targets, + call_args={"transpose": True, "first": False}, +) + +targets = [ + "re:.*.attn.q_proj$", + 
"re:.*.attn.o_proj$", + "re:.*.mlp.down_proj$", +] # 2048 * 2048 +u_q_o_down_proj = TransformationArgs( + targets=targets, + module_targets=module_targets, +) + +targets = ["re:.*.mlp.gate_proj$", "re:.*.mlp.up_proj$"] # 8192 * 8192 +u_gate_up_proj = TransformationArgs( + targets=targets, + module_targets=module_targets, +) + +targets = ["re:.*.attn.k_proj$", "re:.*.attn.v_proj$"] # 512 * 512 +u_k_v_proj = TransformationArgs( + targets=targets, + module_targets=module_targets, +) + + +# This will apply the random_had to the first set of args +# It will then apply the second set of args +# any overalp will be applied in order +v_scheme = TransformationScheme( + transform_type="hadamard", + groups=[v_linear_args], + transform_creation_args={"size": 2048}, +) + +v_scheme_down_proj = TransformationScheme( + transform_type="hadamard", + groups=[v_down_proj], + transform_creation_args={"size": 8192}, +) + +# We could combine multiple args to the same scheme but then would make it more difficult to consolidate order of transforms +u_scheme_q_o_down_proj = TransformationScheme( + transform_type="hadamard", + groups=[u_q_o_down_proj], + transform_creation_args={"size": 2048}, +) + +u_scheme_gate_up_proj = TransformationScheme( + transform_type="hadamard", + groups=[u_gate_up_proj], + transform_creation_args={"size": 8192}, +) + +u_scheme_k_v_proj = TransformationScheme( + transform_type="hadamard", + groups=[u_k_v_proj], + transform_creation_args={"size": 512}, +) + +# QuIP Recipe with weight only quantization +config = TransformationConfig( + transform_groups={ + "u_transform_q_o_down_proj": u_scheme_q_o_down_proj, + "u_transform_k_v_proj": u_scheme_k_v_proj, + "u_transform_gate_up_proj": u_scheme_gate_up_proj, + "v_transform_linear": v_scheme, + "v_transform_down_proj": v_scheme_down_proj, + } +) + +recipe = QuantizationModifier( + targets="Linear", + ignore=["lm_head"], + config_groups={ + "group_0": QuantizationScheme( + targets=["Linear"], + weights=QuantizationArgs( + num_bits=4, + symmetric=True, + strategy=QuantizationStrategy.GROUP, + group_size=128, + observer="mse" + ), + ) + }, + transforms_config=config, +) + +MODEL_ID = "meta-llama/Llama-3.2-1B-Instruct" + +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, device_map="auto", torch_dtype="auto" +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +oneshot(model=model, recipe=recipe) + +print("\n\n") +print("========== SAMPLE GENERATION ==============") +input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") +output = model.generate(input_ids, max_new_tokens=100) +print(tokenizer.decode(output[0])) +print("==========================================\n\n") + +# Save to disk compressed. 
+SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-uncompressed-hadamard-random-debug" + +model.save_pretrained(SAVE_DIR, save_compressed=False) +tokenizer.save_pretrained(SAVE_DIR) \ No newline at end of file From 80949dc5db12c7b136e8b9ed60a2c61d4a485175 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Mon, 5 May 2025 21:11:26 +0000 Subject: [PATCH 10/11] add transform loading_script --- load_transform_model.py | 111 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 load_transform_model.py diff --git a/load_transform_model.py b/load_transform_model.py new file mode 100644 index 000000000..db8f564d7 --- /dev/null +++ b/load_transform_model.py @@ -0,0 +1,111 @@ +from pathlib import Path + +from safetensors import safe_open +from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers.utils.quantization_config import CompressedTensorsConfig + +MODEL_ID = "Llama-3.2-1B-Instruct-W4A16-uncompressed-hadamard-random-debug" + +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, + device_map="auto", + torch_dtype="auto", + quantization_config=CompressedTensorsConfig(run_compressed=False), +) +breakpoint() +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") +output = model.generate(input_ids, max_new_tokens=100) +print(tokenizer.decode(output[0])) + +import lm_eval + +results = lm_eval.simple_evaluate( + model="hf", + model_args={ + "pretrained": MODEL_ID, + "add_bos_token": True, + "quantization_config": CompressedTensorsConfig(run_compressed=False), + }, + tasks=["gsm8k"], + num_fewshot=8, + limit=1000, + device="cuda:0", + batch_size=100, +) +print(results["results"]) +""" +For: Llama-3.2-1B-Instruct + +Dense: +{'gsm8k': {'alias': 'gsm8k', 'exact_match,strict-match': 0.379, + 'exact_match_stderr,strict-match': 0.015349091002225352, + 'exact_match,flexible-extract': 0.381, + 'exact_match_stderr,flexible-extract': 0.015364734787007436}} + +----------------------------MINMAX ---------------------------: + +QantModifier - NO TRANSFORMS +{'gsm8k': {'alias': 'gsm8k', 'exact_match,strict-match': 0.177, +'exact_match_stderr,strict-match': 0.011743632866916145, +'exact_match,flexible-extract': 0.179, +'exact_match_stderr,flexible-extract': 0.0117721103708122}} + +QuantModifier - TRANSFORMS (random) +{'gsm8k': {'alias': 'gsm8k', 'exact_match,strict-match': 0.231, +'exact_match_stderr,strict-match': 0.012997843819031815, +'exact_match,flexible-extract': 0.236, +'exact_match_stderr,flexible-extract': 0.01301973553930782}} + +GPTQ +{'gsm8k': {'alias': 'gsm8k', 'exact_match,strict-match': 0.243, +'exact_match_stderr,strict-match': 0.013569640199177434, +'exact_match,flexible-extract': 0.244, +'exact_match_stderr,flexible-extract': 0.013588548437881431}} + + +---------------------------MSE-----------------------------------: +QuantModifier - No Transforms +{'gsm8k': {'alias': 'gsm8k', 'exact_match,strict-match': 0.195, +'exact_match_stderr,strict-match': 0.012535235623319334, +'exact_match,flexible-extract': 0.195, + 'exact_match_stderr,flexible-extract': 0.012535235623319334}} + +QuantModifier - With Transforms (random) +{'gsm8k': {'alias': 'gsm8k', 'exact_match,strict-match': 0.243, +'exact_match_stderr,strict-match': 0.013569640199177457, +'exact_match,flexible-extract': 0.244, + 'exact_match_stderr,flexible-extract': 0.013588548437881412}} + +QuantModifier - With Transforms (not random, not normalized ) +{'gsm8k': {'alias': 'gsm8k', 'exact_match,strict-match': 
0.261,
'exact_match_stderr,strict-match': 0.013895037677965126,
'exact_match,flexible-extract': 0.262,
'exact_match_stderr,flexible-extract': 0.013912208651021352}}

QuantModifier - With Transforms (not random, normalized)
{'gsm8k': {'alias': 'gsm8k',
'exact_match,strict-match': 0.27,
'exact_match_stderr,strict-match': 0.014046255632633915,
'exact_match,flexible-extract': 0.27,
 'exact_match_stderr,flexible-extract': 0.014046255632633915}}

GPTQ:
{'gsm8k': {'alias': 'gsm8k', 'exact_match,strict-match': 0.285,
'exact_match_stderr,strict-match': 0.014282120955200484,
'exact_match,flexible-extract': 0.286,
'exact_match_stderr,flexible-extract': 0.01429714686251791}}

---------------------8bit----------------------------------:
QuantModifier - with Transforms (not random, normalized)
{'gsm8k': {'alias': 'gsm8k', 'exact_match,strict-match': 0.371,
'exact_match_stderr,strict-match': 0.015283736211823187,
'exact_match,flexible-extract': 0.372,
'exact_match_stderr,flexible-extract': 0.015292149942040577}}

GPTQ
{'gsm8k': {'alias': 'gsm8k', 'exact_match,strict-match': 0.364,
'exact_match_stderr,strict-match': 0.01522286884052202,
 'exact_match,flexible-extract': 0.365,
 'exact_match_stderr,flexible-extract': 0.015231776226264903}}
"""

From b43b27a2f277a5e62be4f8c713b84fd1c7aa116b Mon Sep 17 00:00:00 2001
From: Dipika Sikka
Date: Tue, 6 May 2025 16:45:08 +0000
Subject: [PATCH 11/11] update model

---
 load_transform_model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/load_transform_model.py b/load_transform_model.py
index db8f564d7..27d120707 100644
--- a/load_transform_model.py
+++ b/load_transform_model.py
@@ -4,7 +4,7 @@
 from safetensors import safe_open
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from transformers.utils.quantization_config import CompressedTensorsConfig
 
-MODEL_ID = "Llama-3.2-1B-Instruct-W4A16-uncompressed-hadamard-random-debug"
+MODEL_ID = "/home/dsikka/Llama-3.2-1B-Instruct-W4A16-uncompressed-hadamard-random-debug"
 
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID,