From cb0a90c8ec7bd071b3b0dd95d253291865672fb1 Mon Sep 17 00:00:00 2001
From: Dipika
Date: Mon, 3 Mar 2025 23:03:02 +0000
Subject: [PATCH 01/11] apply quip recipe

---
 examples/weight_transform.py | 118 +++++++++++++++++++++++++++++++++++
 1 file changed, 118 insertions(+)
 create mode 100644 examples/weight_transform.py

diff --git a/examples/weight_transform.py b/examples/weight_transform.py
new file mode 100644
index 000000000..743c1564f
--- /dev/null
+++ b/examples/weight_transform.py
@@ -0,0 +1,118 @@
+from compressed_tensors.transforms import Hadamard, RandomHadamard, Transforms
+from compressed_tensors.transforms.transform_args import (
+    ModuleTarget,
+    TransformationArgs,
+)
+from compressed_tensors.transforms.transform_config import TransformationConfig
+from compressed_tensors.transforms.transform_data import TransformData
+from compressed_tensors.transforms.transform_scheme import TransformationScheme
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
+
+ignore = ["re:*.mlp.down_proj$"]
+module_targets = [ModuleTarget.WEIGHTS]
+
+# Start with a processed
+targets = ["Linear"]  # 2048 * 2048
+v_linear_args = TransformationArgs(
+    targets=targets, module_targets=module_targets, ignore=ignore, call_args={"transpose": True, "first": False}
+)
+
+targets = ["re:*.mlp.down_proj$"]  # 5632 * 5632
+v_down_proj = TransformationArgs(
+    targets=targets, module_targets=module_targets, call_args={"transpose": True, "first": False}
+)
+
+targets = ["re:*.attn.q_proj$", "re:*.attn.o_proj$", "re:*.mlp.down_proj$"]  # 2048 * 2048
+u_q_o_down_proj = TransformationArgs(
+    targets=targets, module_targets=module_targets,
+)
+
+targets = ["re:*.attn.gate_proj$", "re:*.mlp.up_proj$"]  # 5632 * 5632
+u_gate_up_proj = TransformationArgs(
+    targets=targets, module_targets=module_targets,
+)
+
+targets = ["re:*.attn.k_proj$", "re:*.attn.v_proj$"]  # 256 * 256
+u_k_v_proj = TransformationArgs(
+    targets=targets, module_targets=module_targets,
+)
+
+
+# This will apply the random_had to the first set of args
+# It will then apply the second set of args
+# any overalp will be applied in order
+v_scheme = TransformationScheme(
+    transform_type="random-hadamard",
+    groups=[v_linear_args],
+    transform_creation_args={"size": 2048},
+)
+
+v_scheme_down_proj = TransformationScheme(
+    transform_type="random-hadamard",
+    groups=[v_down_proj],
+    transform_creation_args={"size": 5632},
+)
+
+u_scheme_q_o_down_proj = TransformationScheme(
+    transform_type="random-hadamard",
+    groups=[u_q_o_down_proj],
+    transform_creation_args={"size": 2048},
+)
+
+u_scheme_gate_up_proj = TransformationScheme(
+    transform_type="random-hadamard",
+    groups=[u_gate_up_proj],
+    transform_creation_args={"size": 5632},
+)
+
+u_scheme_k_v_proj = TransformationScheme(
+    transform_type="random-hadamard",
+    groups=[u_k_v_proj],
+    transform_creation_args={"size": 256},
+)
+
+# QuIP Recipe with weight only quantization
+config = TransformationConfig(
+    transform_groups={
+        "u_transform_q_o_down_proj": u_scheme_q_o_down_proj,
+        "u_transform_gate_up_proj": u_scheme_gate_up_proj,
+        "u_transform_k_v_proj": u_scheme_k_v_proj,
+        "v_transform_linear": v_scheme,
+        "v_transform_down_proj": v_scheme_down_proj
+    }
+)
+
+#MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
+MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID,
+    device_map="auto",
+    torch_dtype="auto",
+)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+x = model.model.layers[0]
+attn = x.self_attn
+mlp = x.mlp
+
+layers = [
+    attn.q_proj,
+    attn.k_proj,
+    attn.v_proj,
+    attn.o_proj,
+    mlp.gate_proj,
+    mlp.down_proj,
+    mlp.up_proj
+]
+
+for layer in layers:
+
+    current_weight = layer.weight
+    (n, m) = current_weight.shape
+    U = torch.eye(n).to("cuda").to(torch.bfloat16)
+    V = torch.eye(m).to("cuda").to(torch.bfloat16)
+    print(n, layer)
+
+    output = torch.matmul(U, current_weight)
+    output = torch.matmul(output, V.T)

From 9435d811bceb40610088bef5db21b5250cdc91af Mon Sep 17 00:00:00 2001
From: Dipika
Date: Mon, 3 Mar 2025 23:03:53 +0000
Subject: [PATCH 02/11] update

---
 examples/weight_transform.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/weight_transform.py b/examples/weight_transform.py
index 743c1564f..83e54920a 100644
--- a/examples/weight_transform.py
+++ b/examples/weight_transform.py
@@ -54,6 +54,7 @@
     transform_creation_args={"size": 5632},
 )
 
+# We could combine multiple args to the same scheme but then would make it more difficult to consolidate order of transforms
 u_scheme_q_o_down_proj = TransformationScheme(
     transform_type="random-hadamard",
     groups=[u_q_o_down_proj],

From a7bf319bb84e5d3888d07e166b721df179bbb1e7 Mon Sep 17 00:00:00 2001
From: Dipika
Date: Thu, 6 Mar 2025 01:40:25 +0000
Subject: [PATCH 03/11] update

---
 .../modifiers/quantization/calibration.py | 23 +++++++++++++++++++
 .../quantization/quantization/base.py     |  6 ++++-
 2 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/src/llmcompressor/modifiers/quantization/calibration.py b/src/llmcompressor/modifiers/quantization/calibration.py
index bcb4b7433..3245c8604 100644
--- a/src/llmcompressor/modifiers/quantization/calibration.py
+++ b/src/llmcompressor/modifiers/quantization/calibration.py
@@ -120,8 +120,31 @@ def update_weight_zp_scale(module: Module):
 
     if module.quantization_scheme.weights is not None:
         # set weight scale and zero_point up front, calibration data doesn't affect it
+
+        transform_data = getattr(module, "transform_data", None)
+        if transform_data is not None:
+            # order that the transforms were added to match the order they should be applied
+            untransformed_weight = module.weight.data.clone()
+            for transform_name, transform_values in transform_data.data.items():
+                transform = getattr(module, transform_name)
+                apply = transform_values.get("apply")
+                call_args = transform_values.get("call_args")
+                if call_args:
+                    transformed_weight = apply(
+                        input_tensor=module.weight, transform=transform, **call_args
+                    )
+                else:
+                    transformed_weight = apply(
+                        input_tensor=module.weight, transform=transform
+                    )
+            module.weight.data.copy_(transformed_weight)
+
         call_observer(module=module, base_name="weight")
 
+        # TODO: what do we do here?
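# The hunk above computes the weight scale and zero point from the transformed
# weight, and the added lines that follow restore the untransformed weight, so
# the transforms only influence calibration statistics rather than the stored
# parameter. A minimal sketch of that flow for a module with a single U/V pair
# (the U and V names here are illustrative, not the compressed-tensors API):
#
#     original = module.weight.data.clone()
#     module.weight.data.copy_(U @ module.weight @ V.T)  # calibrate on U W V^T
#     call_observer(module=module, base_name="weight")   # scale/zp from transformed W
#     module.weight.data.copy_(original)                 # stored weight stays unchanged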
+ if transform_data is not None: + module.weight.data.copy_(untransformed_weight) + def calibrate_activations(module: Module, value: torch.Tensor, base_name: str): """ diff --git a/src/llmcompressor/modifiers/quantization/quantization/base.py b/src/llmcompressor/modifiers/quantization/quantization/base.py index 3a8946aef..f370a79fd 100644 --- a/src/llmcompressor/modifiers/quantization/quantization/base.py +++ b/src/llmcompressor/modifiers/quantization/quantization/base.py @@ -10,6 +10,7 @@ is_preset_scheme, preset_name_to_scheme, ) +from compressed_tensors.transforms.transform_config import TransformationConfig from loguru import logger from pydantic import Field, field_validator from torch.nn import Module @@ -74,6 +75,7 @@ class QuantizationModifier(Modifier): """ config_groups: Optional[Dict[str, QuantizationScheme]] = None + transforms_config: Optional[TransformationConfig] = None ignore: List[str] = Field(default_factory=list) targets: Union[str, List[str]] = Field(default_factory=lambda: ["Linear"]) scheme: Optional[Union[str, Dict[str, Any]]] = None @@ -210,7 +212,9 @@ def _check_calibration_data(self, config: QuantizationConfig): def _apply_modifier_to_model(self, model: Module): modifier_as_config = self.create_init_config() # Add step to attach kv_cache to the model, if present within the config - apply_quantization_config(model, modifier_as_config) + apply_quantization_config( + model, modifier_as_config, transforms_config=self.transforms_config + ) model.apply(set_unset_kv_cache) return modifier_as_config From 4a58bb1520c4b436db0e01bca5837870aabd627b Mon Sep 17 00:00:00 2001 From: Dipika Date: Thu, 6 Mar 2025 23:26:31 +0000 Subject: [PATCH 04/11] clean-up --- examples/weight_transform.py | 121 +++++++++++++----- .../modifiers/quantization/calibration.py | 21 +-- 2 files changed, 97 insertions(+), 45 deletions(-) diff --git a/examples/weight_transform.py b/examples/weight_transform.py index 83e54920a..c980d1945 100644 --- a/examples/weight_transform.py +++ b/examples/weight_transform.py @@ -1,41 +1,62 @@ +import torch +from compressed_tensors.quantization import ( + QuantizationArgs, + QuantizationScheme, + QuantizationStrategy, +) from compressed_tensors.transforms import Hadamard, RandomHadamard, Transforms from compressed_tensors.transforms.transform_args import ( ModuleTarget, TransformationArgs, ) from compressed_tensors.transforms.transform_config import TransformationConfig -from compressed_tensors.transforms.transform_data import TransformData from compressed_tensors.transforms.transform_scheme import TransformationScheme from transformers import AutoModelForCausalLM, AutoTokenizer -import torch -ignore = ["re:*.mlp.down_proj$"] -module_targets = [ModuleTarget.WEIGHTS] +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import QuantizationModifier + +# U(W)V.T + +ignore = ["re:.*.mlp.down_proj$"] +module_targets = [ModuleTarget.WEIGHT.value] -# Start with a processed -targets = ["Linear"] # 2048 * 2048 +# Start with a processed +targets = ["Linear"] # 2048 * 2048 v_linear_args = TransformationArgs( - targets=targets, module_targets=module_targets, ignore=ignore, call_args={"transpose": True, "first": False} + targets=targets, + module_targets=module_targets, + ignore=ignore, + call_args={"transpose": True, "first": False}, ) -targets = ["re:*.mlp.down_proj$"] # 5632 * 5632 +targets = ["re:.*.mlp.down_proj$"] # 8192 * 8192 v_down_proj = TransformationArgs( - targets=targets, module_targets=module_targets, call_args={"transpose": True, "first": 
False} + targets=targets, + module_targets=module_targets, + call_args={"transpose": True, "first": False}, ) -targets = ["re:*.attn.q_proj$", "re:*.attn.o_proj$", "re:*.mlp.down_proj$"] # 2048 * 2048 +targets = [ + "re:.*.attn.q_proj$", + "re:.*.attn.o_proj$", + "re:.*.mlp.down_proj$", +] # 2048 * 2048 u_q_o_down_proj = TransformationArgs( - targets=targets, module_targets=module_targets, + targets=targets, + module_targets=module_targets, ) -targets = ["re:*.attn.gate_proj$", "re:*.mlp.up_proj$"] # 5632 * 5632 +targets = ["re:.*.mlp.gate_proj$", "re:.*.mlp.up_proj$"] # 8192 * 8192 u_gate_up_proj = TransformationArgs( - targets=targets, module_targets=module_targets, + targets=targets, + module_targets=module_targets, ) -targets = ["re:*.attn.k_proj$", "re:*.attn.v_proj$"] # 256 * 256 +targets = ["re:.*.attn.k_proj$", "re:.*.attn.v_proj$"] # 512 * 512 u_k_v_proj = TransformationArgs( - targets=targets, module_targets=module_targets, + targets=targets, + module_targets=module_targets, ) @@ -51,7 +72,7 @@ v_scheme_down_proj = TransformationScheme( transform_type="random-hadamard", groups=[v_down_proj], - transform_creation_args={"size": 5632}, + transform_creation_args={"size": 8192}, ) # We could combine multiple args to the same scheme but then would make it more difficult to consolidate order of transforms @@ -64,35 +85,65 @@ u_scheme_gate_up_proj = TransformationScheme( transform_type="random-hadamard", groups=[u_gate_up_proj], - transform_creation_args={"size": 5632}, + transform_creation_args={"size": 8192}, ) u_scheme_k_v_proj = TransformationScheme( transform_type="random-hadamard", groups=[u_k_v_proj], - transform_creation_args={"size": 256}, + transform_creation_args={"size": 512}, ) # QuIP Recipe with weight only quantization config = TransformationConfig( transform_groups={ "u_transform_q_o_down_proj": u_scheme_q_o_down_proj, - "u_transform_gate_up_proj": u_scheme_gate_up_proj, "u_transform_k_v_proj": u_scheme_k_v_proj, + "u_transform_gate_up_proj": u_scheme_gate_up_proj, "v_transform_linear": v_scheme, - "v_transform_down_proj": v_scheme_down_proj + "v_transform_down_proj": v_scheme_down_proj, } ) -#MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" -MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" +recipe = QuantizationModifier( + targets="Linear", + ignore=["lm_head"], + config_groups={ + "group_0": QuantizationScheme( + targets=["Linear"], + weights=QuantizationArgs( + num_bits=4, + symmetric=True, + strategy=QuantizationStrategy.GROUP, + group_size=128, + ), + ) + }, + transforms_config=config, +) + +MODEL_ID = "meta-llama/Llama-3.2-1B" model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, - device_map="auto", - torch_dtype="auto", + MODEL_ID, device_map="auto", torch_dtype="auto" ) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +oneshot(model=model, recipe=recipe) + +print("\n\n") +print("========== SAMPLE GENERATION ==============") +input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") +output = model.generate(input_ids, max_new_tokens=100) +print(tokenizer.decode(output[0])) +print("==========================================\n\n") + +# Save to disk compressed. 
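# The transform sizes hard-coded in this script follow the Llama-3.2-1B shapes
# noted in the inline comments: hidden_size 2048 (q/o/down_proj outputs and most
# Linear inputs), intermediate_size 8192 (gate/up_proj outputs, down_proj input),
# and a k/v_proj output of num_key_value_heads * head_dim = 512. A hedged sketch
# of deriving those sizes from the loaded config instead of hard-coding them:
#
#     cfg = model.config
#     hidden = cfg.hidden_size                                              # 2048
#     intermediate = cfg.intermediate_size                                  # 8192
#     kv_dim = cfg.num_key_value_heads * (hidden // cfg.num_attention_heads)  # 512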
+SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-Transforms" +model.save_pretrained(SAVE_DIR) +tokenizer.save_pretrained(SAVE_DIR) + +""" x = model.model.layers[0] attn = x.self_attn mlp = x.mlp @@ -104,16 +155,26 @@ attn.o_proj, mlp.gate_proj, mlp.down_proj, - mlp.up_proj + mlp.up_proj, ] -for layer in layers: +from compressed_tensors.transforms.hadamard_utils import ( + deterministic_hadamard_matrix, + random_hadamard_matrix, +) +for layer in layers: current_weight = layer.weight + original_weight = current_weight.data.clone() (n, m) = current_weight.shape - U = torch.eye(n).to("cuda").to(torch.bfloat16) - V = torch.eye(m).to("cuda").to(torch.bfloat16) - print(n, layer) + + U = torch.Tensor(random_hadamard_matrix(n)).to("cuda").to(torch.float32) + V = torch.Tensor(random_hadamard_matrix(m)).to("cuda").to(torch.float32) output = torch.matmul(U, current_weight) output = torch.matmul(output, V.T) + + # apply untransform + x = torch.matmul(U.T, torch.matmul(output, V)) + print(torch.max(abs(x - original_weight))) +""" diff --git a/src/llmcompressor/modifiers/quantization/calibration.py b/src/llmcompressor/modifiers/quantization/calibration.py index 3245c8604..f20951b88 100644 --- a/src/llmcompressor/modifiers/quantization/calibration.py +++ b/src/llmcompressor/modifiers/quantization/calibration.py @@ -4,6 +4,7 @@ from compressed_tensors.quantization import QuantizationStatus, is_attention_module from compressed_tensors.quantization.lifecycle.forward import forward_quantize from compressed_tensors.quantization.utils import is_kv_cache_quant_scheme +from compressed_tensors.transforms.apply import apply_transforms_to_parameter from compressed_tensors.utils.offload import is_module_offloaded, update_parameter_data from loguru import logger from torch.nn import Module @@ -123,25 +124,15 @@ def update_weight_zp_scale(module: Module): transform_data = getattr(module, "transform_data", None) if transform_data is not None: - # order that the transforms were added to match the order they should be applied untransformed_weight = module.weight.data.clone() - for transform_name, transform_values in transform_data.data.items(): - transform = getattr(module, transform_name) - apply = transform_values.get("apply") - call_args = transform_values.get("call_args") - if call_args: - transformed_weight = apply( - input_tensor=module.weight, transform=transform, **call_args - ) - else: - transformed_weight = apply( - input_tensor=module.weight, transform=transform - ) - module.weight.data.copy_(transformed_weight) + apply_transforms_to_parameter( + module=module, + module_parameter=module.weight, + transform_data=transform_data, + ) call_observer(module=module, base_name="weight") - # TODO: what do we do here? if transform_data is not None: module.weight.data.copy_(untransformed_weight) From 744a31122eac22c0da4e97afaa41b32af309f2fc Mon Sep 17 00:00:00 2001 From: Dipika Date: Fri, 7 Mar 2025 23:48:00 +0000 Subject: [PATCH 05/11] update --- examples/weight_transform.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/weight_transform.py b/examples/weight_transform.py index c980d1945..84c1f1cee 100644 --- a/examples/weight_transform.py +++ b/examples/weight_transform.py @@ -18,7 +18,7 @@ # U(W)V.T -ignore = ["re:.*.mlp.down_proj$"] +ignore = ["re:.*.mlp.down_proj$", "lm_head"] module_targets = [ModuleTarget.WEIGHT.value] # Start with a processed @@ -140,6 +140,7 @@ # Save to disk compressed. 
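# The first hunk of this patch adds "lm_head" to the V-transform ignore list, so
# the catch-all targets=["Linear"] group no longer touches the output head, which
# the quantization recipe in this script also leaves alone. An informal summary of
# the grouping the config above resolves to for one decoder layer:
#
#     # V (input side), size 2048: q/k/v/o_proj, gate_proj, up_proj
#     # V (input side), size 8192: down_proj
#     # U (output side): q/o/down_proj -> 2048, gate/up_proj -> 8192, k/v_proj -> 512
#     # untouched by both transforms and quantization: lm_head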
SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-Transforms" + model.save_pretrained(SAVE_DIR) tokenizer.save_pretrained(SAVE_DIR) From 3084f39b588eb33b9a46e957190409a91818ddc3 Mon Sep 17 00:00:00 2001 From: Dipika Date: Mon, 10 Mar 2025 16:38:23 +0000 Subject: [PATCH 06/11] update --- examples/weight_transform.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/weight_transform.py b/examples/weight_transform.py index 84c1f1cee..731f50449 100644 --- a/examples/weight_transform.py +++ b/examples/weight_transform.py @@ -15,7 +15,6 @@ from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier - # U(W)V.T ignore = ["re:.*.mlp.down_proj$", "lm_head"] @@ -64,32 +63,32 @@ # It will then apply the second set of args # any overalp will be applied in order v_scheme = TransformationScheme( - transform_type="random-hadamard", + transform_type="hadamard", groups=[v_linear_args], transform_creation_args={"size": 2048}, ) v_scheme_down_proj = TransformationScheme( - transform_type="random-hadamard", + transform_type="hadamard", groups=[v_down_proj], transform_creation_args={"size": 8192}, ) # We could combine multiple args to the same scheme but then would make it more difficult to consolidate order of transforms u_scheme_q_o_down_proj = TransformationScheme( - transform_type="random-hadamard", + transform_type="hadamard", groups=[u_q_o_down_proj], transform_creation_args={"size": 2048}, ) u_scheme_gate_up_proj = TransformationScheme( - transform_type="random-hadamard", + transform_type="hadamard", groups=[u_gate_up_proj], transform_creation_args={"size": 8192}, ) u_scheme_k_v_proj = TransformationScheme( - transform_type="random-hadamard", + transform_type="hadamard", groups=[u_k_v_proj], transform_creation_args={"size": 512}, ) @@ -116,13 +115,14 @@ symmetric=True, strategy=QuantizationStrategy.GROUP, group_size=128, + observer="mse" ), ) }, transforms_config=config, ) -MODEL_ID = "meta-llama/Llama-3.2-1B" +MODEL_ID = "meta-llama/Llama-3.2-1B-Instruct" model = AutoModelForCausalLM.from_pretrained( MODEL_ID, device_map="auto", torch_dtype="auto" @@ -139,9 +139,9 @@ print("==========================================\n\n") # Save to disk compressed. 
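# This patch switches every scheme from "random-hadamard" to "hadamard" and adds
# an MSE weight observer; the eval notes kept in load_transform_model.py later in
# this series compare the two transform types. A hedged sketch for inspecting
# either construction with the hadamard_utils helpers this series already imports
# (whether the returned matrix is normalized by 1/sqrt(n) is an assumption here):
#
#     from compressed_tensors.transforms.hadamard_utils import (
#         deterministic_hadamard_matrix,
#         random_hadamard_matrix,
#     )
#     H = torch.Tensor(random_hadamard_matrix(2048))
#     # a normalized Hadamard gives H @ H.T ~= I; an unnormalized one gives 2048 * I
#     print(torch.dist(H @ H.T, torch.eye(2048)))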
-SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-Transforms" +SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-uncompressed-hadamard-random" -model.save_pretrained(SAVE_DIR) +model.save_pretrained(SAVE_DIR, save_compressed=False) tokenizer.save_pretrained(SAVE_DIR) """ From 919be473e5eeb5acbc63d49bf347e6a730bd7524 Mon Sep 17 00:00:00 2001 From: Dipika Date: Tue, 11 Mar 2025 22:06:38 +0000 Subject: [PATCH 07/11] remove test script --- examples/weight_transform.py | 181 ----------------------------------- 1 file changed, 181 deletions(-) delete mode 100644 examples/weight_transform.py diff --git a/examples/weight_transform.py b/examples/weight_transform.py deleted file mode 100644 index 731f50449..000000000 --- a/examples/weight_transform.py +++ /dev/null @@ -1,181 +0,0 @@ -import torch -from compressed_tensors.quantization import ( - QuantizationArgs, - QuantizationScheme, - QuantizationStrategy, -) -from compressed_tensors.transforms import Hadamard, RandomHadamard, Transforms -from compressed_tensors.transforms.transform_args import ( - ModuleTarget, - TransformationArgs, -) -from compressed_tensors.transforms.transform_config import TransformationConfig -from compressed_tensors.transforms.transform_scheme import TransformationScheme -from transformers import AutoModelForCausalLM, AutoTokenizer - -from llmcompressor import oneshot -from llmcompressor.modifiers.quantization import QuantizationModifier -# U(W)V.T - -ignore = ["re:.*.mlp.down_proj$", "lm_head"] -module_targets = [ModuleTarget.WEIGHT.value] - -# Start with a processed -targets = ["Linear"] # 2048 * 2048 -v_linear_args = TransformationArgs( - targets=targets, - module_targets=module_targets, - ignore=ignore, - call_args={"transpose": True, "first": False}, -) - -targets = ["re:.*.mlp.down_proj$"] # 8192 * 8192 -v_down_proj = TransformationArgs( - targets=targets, - module_targets=module_targets, - call_args={"transpose": True, "first": False}, -) - -targets = [ - "re:.*.attn.q_proj$", - "re:.*.attn.o_proj$", - "re:.*.mlp.down_proj$", -] # 2048 * 2048 -u_q_o_down_proj = TransformationArgs( - targets=targets, - module_targets=module_targets, -) - -targets = ["re:.*.mlp.gate_proj$", "re:.*.mlp.up_proj$"] # 8192 * 8192 -u_gate_up_proj = TransformationArgs( - targets=targets, - module_targets=module_targets, -) - -targets = ["re:.*.attn.k_proj$", "re:.*.attn.v_proj$"] # 512 * 512 -u_k_v_proj = TransformationArgs( - targets=targets, - module_targets=module_targets, -) - - -# This will apply the random_had to the first set of args -# It will then apply the second set of args -# any overalp will be applied in order -v_scheme = TransformationScheme( - transform_type="hadamard", - groups=[v_linear_args], - transform_creation_args={"size": 2048}, -) - -v_scheme_down_proj = TransformationScheme( - transform_type="hadamard", - groups=[v_down_proj], - transform_creation_args={"size": 8192}, -) - -# We could combine multiple args to the same scheme but then would make it more difficult to consolidate order of transforms -u_scheme_q_o_down_proj = TransformationScheme( - transform_type="hadamard", - groups=[u_q_o_down_proj], - transform_creation_args={"size": 2048}, -) - -u_scheme_gate_up_proj = TransformationScheme( - transform_type="hadamard", - groups=[u_gate_up_proj], - transform_creation_args={"size": 8192}, -) - -u_scheme_k_v_proj = TransformationScheme( - transform_type="hadamard", - groups=[u_k_v_proj], - transform_creation_args={"size": 512}, -) - -# QuIP Recipe with weight only quantization -config = TransformationConfig( - transform_groups={ 
- "u_transform_q_o_down_proj": u_scheme_q_o_down_proj, - "u_transform_k_v_proj": u_scheme_k_v_proj, - "u_transform_gate_up_proj": u_scheme_gate_up_proj, - "v_transform_linear": v_scheme, - "v_transform_down_proj": v_scheme_down_proj, - } -) - -recipe = QuantizationModifier( - targets="Linear", - ignore=["lm_head"], - config_groups={ - "group_0": QuantizationScheme( - targets=["Linear"], - weights=QuantizationArgs( - num_bits=4, - symmetric=True, - strategy=QuantizationStrategy.GROUP, - group_size=128, - observer="mse" - ), - ) - }, - transforms_config=config, -) - -MODEL_ID = "meta-llama/Llama-3.2-1B-Instruct" - -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto" -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -oneshot(model=model, recipe=recipe) - -print("\n\n") -print("========== SAMPLE GENERATION ==============") -input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") -output = model.generate(input_ids, max_new_tokens=100) -print(tokenizer.decode(output[0])) -print("==========================================\n\n") - -# Save to disk compressed. -SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-uncompressed-hadamard-random" - -model.save_pretrained(SAVE_DIR, save_compressed=False) -tokenizer.save_pretrained(SAVE_DIR) - -""" -x = model.model.layers[0] -attn = x.self_attn -mlp = x.mlp - -layers = [ - attn.q_proj, - attn.k_proj, - attn.v_proj, - attn.o_proj, - mlp.gate_proj, - mlp.down_proj, - mlp.up_proj, -] - -from compressed_tensors.transforms.hadamard_utils import ( - deterministic_hadamard_matrix, - random_hadamard_matrix, -) - -for layer in layers: - current_weight = layer.weight - original_weight = current_weight.data.clone() - (n, m) = current_weight.shape - - U = torch.Tensor(random_hadamard_matrix(n)).to("cuda").to(torch.float32) - V = torch.Tensor(random_hadamard_matrix(m)).to("cuda").to(torch.float32) - - output = torch.matmul(U, current_weight) - output = torch.matmul(output, V.T) - - # apply untransform - x = torch.matmul(U.T, torch.matmul(output, V)) - print(torch.max(abs(x - original_weight))) -""" From 82686a56a1b21722a14caf155aec1bb5e3e657f7 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Thu, 3 Apr 2025 21:38:23 +0000 Subject: [PATCH 08/11] update --- .../modifiers/quantization/calibration.py | 25 ++++++---- .../quantization/quantization/base.py | 50 +++++++++++++++---- src/llmcompressor/utils/helpers.py | 1 - 3 files changed, 53 insertions(+), 23 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/calibration.py b/src/llmcompressor/modifiers/quantization/calibration.py index f20951b88..23a987fd4 100644 --- a/src/llmcompressor/modifiers/quantization/calibration.py +++ b/src/llmcompressor/modifiers/quantization/calibration.py @@ -2,9 +2,8 @@ import torch from compressed_tensors.quantization import QuantizationStatus, is_attention_module -from compressed_tensors.quantization.lifecycle.forward import forward_quantize from compressed_tensors.quantization.utils import is_kv_cache_quant_scheme -from compressed_tensors.transforms.apply import apply_transforms_to_parameter +from compressed_tensors.transforms.apply import apply_transforms_to_activations_or_parameter from compressed_tensors.utils.offload import is_module_offloaded, update_parameter_data from loguru import logger from torch.nn import Module @@ -125,9 +124,9 @@ def update_weight_zp_scale(module: Module): transform_data = getattr(module, "transform_data", None) if transform_data is not None: untransformed_weight = 
module.weight.data.clone() - apply_transforms_to_parameter( + apply_transforms_to_activations_or_parameter( module=module, - module_parameter=module.weight, + module_activation_or_parameter=module.weight, transform_data=transform_data, ) @@ -152,11 +151,22 @@ def calibrate_activations(module: Module, value: torch.Tensor, base_name: str): if value.numel() == 0: return + transform_data = getattr(module, "transform_data", None) + if transform_data is not None: + value = apply_transforms_to_activations_or_parameter( + module=module, + module_activation_or_parameter=value, + transform_data=transform_data, + update_in_place=False + ) + call_observer( module=module, base_name=base_name, value=value, ) + breakpoint() + # validate value is correct def calibrate_input_hook(module: Module, args: Any): @@ -180,12 +190,6 @@ def calibrate_output_hook(module: Module, _args: Any, output: torch.Tensor): value=output, base_name="output", ) - output = forward_quantize( - module=module, - value=output, - base_name="output", - args=module.quantization_scheme.output_activations, - ) return output @@ -211,7 +215,6 @@ def calibrate_kv_cache_output_hook(module: Module, _args: Any, _output: torch.Te update_parameter_data(module, kv_cache.k_scales[module.layer_idx], "k_scale") update_parameter_data(module, kv_cache.v_scales[module.layer_idx], "v_scale") - def set_unset_kv_cache(module: Module): """ Set or unset singleton QuantizedKVParameterCache for each diff --git a/src/llmcompressor/modifiers/quantization/quantization/base.py b/src/llmcompressor/modifiers/quantization/quantization/base.py index f370a79fd..b514d0c0b 100644 --- a/src/llmcompressor/modifiers/quantization/quantization/base.py +++ b/src/llmcompressor/modifiers/quantization/quantization/base.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Union, Set from compressed_tensors.quantization import ( QuantizationArgs, @@ -10,6 +10,11 @@ is_preset_scheme, preset_name_to_scheme, ) +from compressed_tensors.quantization.lifecycle import ( + post_forward_quantize, + pre_forward_quantize, + register_quantization_hooks, +) from compressed_tensors.transforms.transform_config import TransformationConfig from loguru import logger from pydantic import Field, field_validator @@ -85,6 +90,7 @@ class QuantizationModifier(Modifier): calibration_dataloader_: Any = None calibration_function_: Any = None + _handles: Set = set() @field_validator("targets", mode="before") def validate_targets(cls, value: Union[str, List[str]]) -> List[str]: @@ -213,7 +219,10 @@ def _apply_modifier_to_model(self, model: Module): modifier_as_config = self.create_init_config() # Add step to attach kv_cache to the model, if present within the config apply_quantization_config( - model, modifier_as_config, transforms_config=self.transforms_config + model, + modifier_as_config, + transforms_config=self.transforms_config, + delay_forward_quantize=True, ) model.apply(set_unset_kv_cache) return modifier_as_config @@ -262,6 +271,9 @@ def _calibrate_if_possible(self, module: Module): ) elif not self.calibration_dataloader_: + # TODO: should just use HooksMixin + # hooks should have been delayed + module.apply(lambda model: register_quantization_hooks(model)) return module.apply(lambda model: initialize_observer(model, base_name="input")) @@ -269,7 +281,7 @@ def _calibrate_if_possible(self, module: Module): module.apply(self.register_calibration_hooks) self._calibrate(module) module.apply(set_unset_kv_cache) - self.remove_hooks() + 
self.remove_hooks(self._handles) def register_calibration_hooks(self, module: Module): """ @@ -289,23 +301,39 @@ def register_calibration_hooks(self, module: Module): # Calibrate inputs if an input_quant is provided and not running dynamic quant if calibrate_inputs: - self.register_hook(module, calibrate_input_hook, "forward_pre") + self._handles.add( + self.register_hook(module, calibrate_input_hook, "forward_pre") + ) + + if not is_attention_module_: + self.register_hook(module, pre_forward_quantize, "forward_pre") if output_quant: # hooks for attn modules if running kv_cache quant if is_attention_module_: - self.register_hook( - module, - calibrate_kv_cache_input_hook, - "forward_pre", - with_kwargs=True, + self._handles.add( + self.register_hook( + module, + calibrate_kv_cache_input_hook, + "forward_pre", + with_kwargs=True, + ) ) - self.register_hook(module, calibrate_kv_cache_output_hook, "forward") + self._handles.add( + self.register_hook( + module, calibrate_kv_cache_output_hook, "forward" + ) + ) # hooks for output quant if not running dynamic quant elif not output_quant.dynamic: - self.register_hook(module, calibrate_output_hook, "forward") + self._handles.add( + self.register_hook(module, calibrate_output_hook, "forward") + ) + + if not is_attention_module_: + self.register_hook(module, post_forward_quantize, "forward") def _calibrate(self, module: Module): class_name = self.__class__.__name__.replace("PyTorch", "") diff --git a/src/llmcompressor/utils/helpers.py b/src/llmcompressor/utils/helpers.py index 75fad8311..6c5919652 100644 --- a/src/llmcompressor/utils/helpers.py +++ b/src/llmcompressor/utils/helpers.py @@ -1125,7 +1125,6 @@ def calibration_forward_context(model: PreTrainedModel): with ( torch.no_grad(), DisableKVCache(model), - DisableQuantization(model), eval_context(model), ): yield From a06da00f6609fee7c59ef4c0405459f38bc66583 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Mon, 5 May 2025 21:04:55 +0000 Subject: [PATCH 09/11] add script to apply weight-only transformers --- weight_transform.py | 145 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 145 insertions(+) create mode 100644 weight_transform.py diff --git a/weight_transform.py b/weight_transform.py new file mode 100644 index 000000000..74b61078f --- /dev/null +++ b/weight_transform.py @@ -0,0 +1,145 @@ +import torch +from compressed_tensors.quantization import ( + QuantizationArgs, + QuantizationScheme, + QuantizationStrategy, +) +from compressed_tensors.transforms import Hadamard, RandomHadamard, Transforms +from compressed_tensors.transforms.transform_args import ( + ModuleTarget, + TransformationArgs, +) +from compressed_tensors.transforms.transform_config import TransformationConfig +from compressed_tensors.transforms.transform_scheme import TransformationScheme +from transformers import AutoModelForCausalLM, AutoTokenizer + +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import QuantizationModifier +# U(W)V.T + +ignore = ["re:.*.mlp.down_proj$", "lm_head"] +module_targets = [ModuleTarget.WEIGHT.value] + +# Start with a processed +targets = ["Linear"] # 2048 * 2048 +v_linear_args = TransformationArgs( + targets=targets, + module_targets=module_targets, + ignore=ignore, + call_args={"transpose": True, "first": False}, +) + +targets = ["re:.*.mlp.down_proj$"] # 8192 * 8192 +v_down_proj = TransformationArgs( + targets=targets, + module_targets=module_targets, + call_args={"transpose": True, "first": False}, +) + +targets = [ + "re:.*.attn.q_proj$", + 
"re:.*.attn.o_proj$", + "re:.*.mlp.down_proj$", +] # 2048 * 2048 +u_q_o_down_proj = TransformationArgs( + targets=targets, + module_targets=module_targets, +) + +targets = ["re:.*.mlp.gate_proj$", "re:.*.mlp.up_proj$"] # 8192 * 8192 +u_gate_up_proj = TransformationArgs( + targets=targets, + module_targets=module_targets, +) + +targets = ["re:.*.attn.k_proj$", "re:.*.attn.v_proj$"] # 512 * 512 +u_k_v_proj = TransformationArgs( + targets=targets, + module_targets=module_targets, +) + + +# This will apply the random_had to the first set of args +# It will then apply the second set of args +# any overalp will be applied in order +v_scheme = TransformationScheme( + transform_type="hadamard", + groups=[v_linear_args], + transform_creation_args={"size": 2048}, +) + +v_scheme_down_proj = TransformationScheme( + transform_type="hadamard", + groups=[v_down_proj], + transform_creation_args={"size": 8192}, +) + +# We could combine multiple args to the same scheme but then would make it more difficult to consolidate order of transforms +u_scheme_q_o_down_proj = TransformationScheme( + transform_type="hadamard", + groups=[u_q_o_down_proj], + transform_creation_args={"size": 2048}, +) + +u_scheme_gate_up_proj = TransformationScheme( + transform_type="hadamard", + groups=[u_gate_up_proj], + transform_creation_args={"size": 8192}, +) + +u_scheme_k_v_proj = TransformationScheme( + transform_type="hadamard", + groups=[u_k_v_proj], + transform_creation_args={"size": 512}, +) + +# QuIP Recipe with weight only quantization +config = TransformationConfig( + transform_groups={ + "u_transform_q_o_down_proj": u_scheme_q_o_down_proj, + "u_transform_k_v_proj": u_scheme_k_v_proj, + "u_transform_gate_up_proj": u_scheme_gate_up_proj, + "v_transform_linear": v_scheme, + "v_transform_down_proj": v_scheme_down_proj, + } +) + +recipe = QuantizationModifier( + targets="Linear", + ignore=["lm_head"], + config_groups={ + "group_0": QuantizationScheme( + targets=["Linear"], + weights=QuantizationArgs( + num_bits=4, + symmetric=True, + strategy=QuantizationStrategy.GROUP, + group_size=128, + observer="mse" + ), + ) + }, + transforms_config=config, +) + +MODEL_ID = "meta-llama/Llama-3.2-1B-Instruct" + +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, device_map="auto", torch_dtype="auto" +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +oneshot(model=model, recipe=recipe) + +print("\n\n") +print("========== SAMPLE GENERATION ==============") +input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") +output = model.generate(input_ids, max_new_tokens=100) +print(tokenizer.decode(output[0])) +print("==========================================\n\n") + +# Save to disk compressed. 
+SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-uncompressed-hadamard-random-debug" + +model.save_pretrained(SAVE_DIR, save_compressed=False) +tokenizer.save_pretrained(SAVE_DIR) \ No newline at end of file From 80949dc5db12c7b136e8b9ed60a2c61d4a485175 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Mon, 5 May 2025 21:11:26 +0000 Subject: [PATCH 10/11] add transform loading_script --- load_transform_model.py | 111 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 load_transform_model.py diff --git a/load_transform_model.py b/load_transform_model.py new file mode 100644 index 000000000..db8f564d7 --- /dev/null +++ b/load_transform_model.py @@ -0,0 +1,111 @@ +from pathlib import Path + +from safetensors import safe_open +from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers.utils.quantization_config import CompressedTensorsConfig + +MODEL_ID = "Llama-3.2-1B-Instruct-W4A16-uncompressed-hadamard-random-debug" + +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, + device_map="auto", + torch_dtype="auto", + quantization_config=CompressedTensorsConfig(run_compressed=False), +) +breakpoint() +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") +output = model.generate(input_ids, max_new_tokens=100) +print(tokenizer.decode(output[0])) + +import lm_eval + +results = lm_eval.simple_evaluate( + model="hf", + model_args={ + "pretrained": MODEL_ID, + "add_bos_token": True, + "quantization_config": CompressedTensorsConfig(run_compressed=False), + }, + tasks=["gsm8k"], + num_fewshot=8, + limit=1000, + device="cuda:0", + batch_size=100, +) +print(results["results"]) +""" +For: Llama-3.2-1B-Instruct + +Dense: +{'gsm8k': {'alias': 'gsm8k', 'exact_match,strict-match': 0.379, + 'exact_match_stderr,strict-match': 0.015349091002225352, + 'exact_match,flexible-extract': 0.381, + 'exact_match_stderr,flexible-extract': 0.015364734787007436}} + +----------------------------MINMAX ---------------------------: + +QantModifier - NO TRANSFORMS +{'gsm8k': {'alias': 'gsm8k', 'exact_match,strict-match': 0.177, +'exact_match_stderr,strict-match': 0.011743632866916145, +'exact_match,flexible-extract': 0.179, +'exact_match_stderr,flexible-extract': 0.0117721103708122}} + +QuantModifier - TRANSFORMS (random) +{'gsm8k': {'alias': 'gsm8k', 'exact_match,strict-match': 0.231, +'exact_match_stderr,strict-match': 0.012997843819031815, +'exact_match,flexible-extract': 0.236, +'exact_match_stderr,flexible-extract': 0.01301973553930782}} + +GPTQ +{'gsm8k': {'alias': 'gsm8k', 'exact_match,strict-match': 0.243, +'exact_match_stderr,strict-match': 0.013569640199177434, +'exact_match,flexible-extract': 0.244, +'exact_match_stderr,flexible-extract': 0.013588548437881431}} + + +---------------------------MSE-----------------------------------: +QuantModifier - No Transforms +{'gsm8k': {'alias': 'gsm8k', 'exact_match,strict-match': 0.195, +'exact_match_stderr,strict-match': 0.012535235623319334, +'exact_match,flexible-extract': 0.195, + 'exact_match_stderr,flexible-extract': 0.012535235623319334}} + +QuantModifier - With Transforms (random) +{'gsm8k': {'alias': 'gsm8k', 'exact_match,strict-match': 0.243, +'exact_match_stderr,strict-match': 0.013569640199177457, +'exact_match,flexible-extract': 0.244, + 'exact_match_stderr,flexible-extract': 0.013588548437881412}} + +QuantModifier - With Transforms (not random, not normalized ) +{'gsm8k': {'alias': 'gsm8k', 'exact_match,strict-match': 
0.261,
'exact_match_stderr,strict-match': 0.013895037677965126,
'exact_match,flexible-extract': 0.262,
'exact_match_stderr,flexible-extract': 0.013912208651021352}}

QuantModifier - With Transforms (not random, normalized)
{'gsm8k': {'alias': 'gsm8k',
'exact_match,strict-match': 0.27,
'exact_match_stderr,strict-match': 0.014046255632633915,
'exact_match,flexible-extract': 0.27,
 'exact_match_stderr,flexible-extract': 0.014046255632633915}}

GPTQ:
{'gsm8k': {'alias': 'gsm8k', 'exact_match,strict-match': 0.285,
'exact_match_stderr,strict-match': 0.014282120955200484,
'exact_match,flexible-extract': 0.286,
'exact_match_stderr,flexible-extract': 0.01429714686251791}}

---------------------8bit----------------------------------:
QuantModifier - with Transforms (not random, normalized)
{'gsm8k': {'alias': 'gsm8k', 'exact_match,strict-match': 0.371,
'exact_match_stderr,strict-match': 0.015283736211823187,
'exact_match,flexible-extract': 0.372,
'exact_match_stderr,flexible-extract': 0.015292149942040577}}

GPTQ
{'gsm8k': {'alias': 'gsm8k', 'exact_match,strict-match': 0.364,
'exact_match_stderr,strict-match': 0.01522286884052202,
 'exact_match,flexible-extract': 0.365,
 'exact_match_stderr,flexible-extract': 0.015231776226264903}}
"""

From b43b27a2f277a5e62be4f8c713b84fd1c7aa116b Mon Sep 17 00:00:00 2001
From: Dipika Sikka
Date: Tue, 6 May 2025 16:45:08 +0000
Subject: [PATCH 11/11] update model

---
 load_transform_model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/load_transform_model.py b/load_transform_model.py
index db8f564d7..27d120707 100644
--- a/load_transform_model.py
+++ b/load_transform_model.py
@@ -4,7 +4,7 @@
 from safetensors import safe_open
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from transformers.utils.quantization_config import CompressedTensorsConfig
 
-MODEL_ID = "Llama-3.2-1B-Instruct-W4A16-uncompressed-hadamard-random-debug"
+MODEL_ID = "/home/dsikka/Llama-3.2-1B-Instruct-W4A16-uncompressed-hadamard-random-debug"
 
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID,