From df101ea860ccdd10b92814408a1225dd8de28891 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Mon, 7 Jul 2025 12:32:14 -0400 Subject: [PATCH 1/6] remove iters Signed-off-by: Kyle Sayers --- .../modifiers/smoothquant/base.py | 5 +- src/llmcompressor/observers/base.py | 4 +- .../transformers/compression/helpers.py | 4 +- .../compression/quantization_format.py | 3 +- .../compression/sparsity_metadata_config.py | 54 +++++++++---------- .../ast_utils.py/test_auto_wrapper.py | 7 +-- .../compression/test_run_compressed.py | 5 +- .../test_sparsity_metadata_config.py | 9 ---- .../transformers/kv_cache/test_kv_cache.py | 13 ++--- 9 files changed, 41 insertions(+), 63 deletions(-) diff --git a/src/llmcompressor/modifiers/smoothquant/base.py b/src/llmcompressor/modifiers/smoothquant/base.py index 42c35021e..c2b4a4ce3 100644 --- a/src/llmcompressor/modifiers/smoothquant/base.py +++ b/src/llmcompressor/modifiers/smoothquant/base.py @@ -127,10 +127,7 @@ def on_initialize(self, state: State, **kwargs) -> bool: f"Expected start to be None or -1, got {self.end}" ) - if ( - not hasattr(state, 'data') or - state.data.calib is None - ): + if not hasattr(state, "data") or state.data.calib is None: raise ValueError( f"{self.__class__.__name__} requires a calibration dataset to be " "provided" diff --git a/src/llmcompressor/observers/base.py b/src/llmcompressor/observers/base.py index b7169d1d6..3ee446cf3 100644 --- a/src/llmcompressor/observers/base.py +++ b/src/llmcompressor/observers/base.py @@ -2,6 +2,7 @@ from typing import Any, Iterable, Optional, Tuple, Union import torch +from compressed_tensors import InternalModule from compressed_tensors.quantization.quant_args import ( FP8_E4M3_DATA, QuantizationArgs, @@ -12,12 +13,11 @@ from compressed_tensors.utils import safe_permute from loguru import logger from torch import FloatTensor, IntTensor, Tensor -from torch.nn import Module __all__ = ["Observer"] -class Observer(Module, RegistryMixin): +class Observer(InternalModule, RegistryMixin): """ Base Observer class to be subclassed for specific implementation. 
Subclasses should override `calculate_qparams` to return a scale, zero_point diff --git a/src/llmcompressor/transformers/compression/helpers.py b/src/llmcompressor/transformers/compression/helpers.py index 1197532bb..2bc6b63f1 100644 --- a/src/llmcompressor/transformers/compression/helpers.py +++ b/src/llmcompressor/transformers/compression/helpers.py @@ -3,7 +3,7 @@ import torch from accelerate.accelerator import get_state_dict_offloaded_model -from compressed_tensors.quantization.utils import iter_named_leaf_modules, module_type +from compressed_tensors.quantization.utils import module_type from compressed_tensors.utils import align_module_device from tqdm import tqdm @@ -163,7 +163,7 @@ def _get_sparse_targets_ignore_dicts( exhaustive_targets = defaultdict(list) exhaustive_ignore = defaultdict(list) - for name, submodule in iter_named_leaf_modules(module): + for name, submodule in module.named_modules(): submodule_type = module_type(submodule) is_target = is_sparse_compression_target( module=submodule, diff --git a/src/llmcompressor/transformers/compression/quantization_format.py b/src/llmcompressor/transformers/compression/quantization_format.py index 59d29bae4..e0822bb9e 100644 --- a/src/llmcompressor/transformers/compression/quantization_format.py +++ b/src/llmcompressor/transformers/compression/quantization_format.py @@ -6,7 +6,6 @@ from compressed_tensors.quantization.utils import ( is_model_quantized, is_module_quantized, - iter_named_leaf_modules, ) __all__ = ["infer_quantization_format"] @@ -107,7 +106,7 @@ def _get_unique_quant_args(model): """ quant_info_weight = [] quant_info_inputs = [] - for _, submodule in iter_named_leaf_modules(model): + for submodule in model.modules(): if is_module_quantized(submodule): weight_scheme = submodule.quantization_scheme.weights input_scheme = submodule.quantization_scheme.input_activations diff --git a/src/llmcompressor/transformers/compression/sparsity_metadata_config.py b/src/llmcompressor/transformers/compression/sparsity_metadata_config.py index 4e64c61d9..fcf68ec92 100644 --- a/src/llmcompressor/transformers/compression/sparsity_metadata_config.py +++ b/src/llmcompressor/transformers/compression/sparsity_metadata_config.py @@ -6,7 +6,6 @@ from compressed_tensors.quantization.utils import ( is_model_quantized, is_module_quantized, - iter_named_leaf_modules, ) from loguru import logger from torch import Tensor @@ -208,33 +207,34 @@ def is_sparse24_bitmask_supported( QuantizationType.FLOAT.value, ] - for _, submodule in iter_named_leaf_modules(model): + for submodule in model.modules(): if is_module_quantized(submodule): - weight_scheme = submodule.quantization_scheme.weights - input_scheme = submodule.quantization_scheme.input_activations - - if weight_scheme and input_scheme: - # weight and activation quantization - # check schemes are supported - for scheme in [weight_scheme, input_scheme]: - scheme_supported = ( - scheme.num_bits == 8 - and scheme.type in supported_scheme_types - ) - if not scheme_supported: - logger.info( - "Quantization scheme not supported," - " turning off sparse 24 compression." - f" Invalid Scheme: {scheme}" - ) - return False - - elif weight_scheme or input_scheme: - # weight only quantization - logger.info( - "Weight only quantization detected, " - "turning off sparse 24 compression." 
+ continue + + weight_scheme = submodule.quantization_scheme.weights + input_scheme = submodule.quantization_scheme.input_activations + + if weight_scheme and input_scheme: + # weight and activation quantization + # check schemes are supported + for scheme in [weight_scheme, input_scheme]: + scheme_supported = ( + scheme.num_bits == 8 and scheme.type in supported_scheme_types ) - return False + if not scheme_supported: + logger.info( + "Quantization scheme not supported," + " turning off sparse 24 compression." + f" Invalid Scheme: {scheme}" + ) + return False + + elif weight_scheme or input_scheme: + # weight only quantization + logger.info( + "Weight only quantization detected, " + "turning off sparse 24 compression." + ) + return False return True diff --git a/tests/llmcompressor/pipelines/sequential/ast_utils.py/test_auto_wrapper.py b/tests/llmcompressor/pipelines/sequential/ast_utils.py/test_auto_wrapper.py index eb913901e..123a334fe 100644 --- a/tests/llmcompressor/pipelines/sequential/ast_utils.py/test_auto_wrapper.py +++ b/tests/llmcompressor/pipelines/sequential/ast_utils.py/test_auto_wrapper.py @@ -120,6 +120,7 @@ def forward(x, y): wrapped_fn = wrapper._wrapper_fn_defs[0] arg_names = {arg.arg for arg in wrapped_fn.args.args} - assert arg_names == {"x", "y"}, ( - f"Expected arguments {{'x', 'y'}}, but got {arg_names}" - ) + assert arg_names == { + "x", + "y", + }, f"Expected arguments {{'x', 'y'}}, but got {arg_names}" diff --git a/tests/llmcompressor/transformers/compression/test_run_compressed.py b/tests/llmcompressor/transformers/compression/test_run_compressed.py index 369828a8e..4be243701 100644 --- a/tests/llmcompressor/transformers/compression/test_run_compressed.py +++ b/tests/llmcompressor/transformers/compression/test_run_compressed.py @@ -5,7 +5,6 @@ import torch from compressed_tensors.linear.compressed_linear import CompressedLinear -from compressed_tensors.quantization.utils import iter_named_leaf_modules from parameterized import parameterized_class from transformers import AutoModelForCausalLM, AutoTokenizer from transformers.utils.quantization_config import CompressedTensorsConfig @@ -132,9 +131,7 @@ def setUpClass(cls): def test_compressed_linear_modules_exist(self): compressed_linear_counts = 0 - for _, submodule in iter_named_leaf_modules( - self.compressed_model, - ): + for submodule in self.compressed_model.modules(): if isinstance(submodule, CompressedLinear): compressed_linear_counts += 1 diff --git a/tests/llmcompressor/transformers/compression/test_sparsity_metadata_config.py b/tests/llmcompressor/transformers/compression/test_sparsity_metadata_config.py index 9ec3e0232..18917a55c 100644 --- a/tests/llmcompressor/transformers/compression/test_sparsity_metadata_config.py +++ b/tests/llmcompressor/transformers/compression/test_sparsity_metadata_config.py @@ -32,11 +32,6 @@ def mock_is_model_quantized(model): return model.is_quantized -def mock_iter_named_leaf_modules(model): - for name, module in model.named_modules(): - yield name, module - - # Mock model class class MockModel(Module): def __init__( @@ -99,10 +94,6 @@ def setup_mocks(self, request): f"{SPARSITY_CONFIG_LOCATION}.is_model_quantized", side_effect=mock_is_model_quantized, ), - patch( - f"{SPARSITY_CONFIG_LOCATION}.iter_named_leaf_modules", - side_effect=mock_iter_named_leaf_modules, - ), ] for patcher in patchers: patcher.start() diff --git a/tests/llmcompressor/transformers/kv_cache/test_kv_cache.py b/tests/llmcompressor/transformers/kv_cache/test_kv_cache.py index 5110f0762..acbc479e7 
100644 --- a/tests/llmcompressor/transformers/kv_cache/test_kv_cache.py +++ b/tests/llmcompressor/transformers/kv_cache/test_kv_cache.py @@ -4,7 +4,6 @@ import pytest from accelerate import init_empty_weights from compressed_tensors.quantization.lifecycle import KVCacheScaleType -from compressed_tensors.quantization.utils.helpers import iter_named_quantizable_modules from datasets import load_dataset from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer from transformers.utils.quantization_config import CompressedTensorsConfig @@ -159,9 +158,7 @@ def test_kv_cache_model_state_dict_attr(oneshot_fixture, tmp_path): model = AutoModelForCausalLM.from_pretrained(str(output_dir)) counts = 0 - for name, submodule in iter_named_quantizable_modules( - model, include_children=False, include_attn=True - ): + for name, submodule in model.named_modules(): counts += 1 assert "self_attn" in name assert hasattr(submodule, KVCacheScaleType.VALUE.value) @@ -200,9 +197,7 @@ def test_kv_cache_gptq_config_format(kv_cache_fixture, tmp_path): model = AutoModelForCausalLM.from_pretrained(output_dir) counts = 0 - for name, submodule in iter_named_quantizable_modules( - model, include_children=False, include_attn=True - ): + for name, submodule in model.named_modules(): counts += 1 assert "self_attn" in name assert hasattr(submodule, KVCacheScaleType.VALUE.value) @@ -246,9 +241,7 @@ def test_kv_cache_gptq_model_state_dict_attr(kv_cache_fixture, tmp_path): ) counts = 0 - for name, submodule in iter_named_quantizable_modules( - model, include_children=False, include_attn=True - ): + for name, submodule in model.named_modules(): counts += 1 assert "self_attn" in name assert hasattr(submodule, KVCacheScaleType.VALUE.value) From d8ab33697d3429c8335a4167527dae31fc468b14 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Mon, 7 Jul 2025 13:18:14 -0400 Subject: [PATCH 2/6] fix typo Signed-off-by: Kyle Sayers --- .../transformers/compression/sparsity_metadata_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llmcompressor/transformers/compression/sparsity_metadata_config.py b/src/llmcompressor/transformers/compression/sparsity_metadata_config.py index fcf68ec92..298395344 100644 --- a/src/llmcompressor/transformers/compression/sparsity_metadata_config.py +++ b/src/llmcompressor/transformers/compression/sparsity_metadata_config.py @@ -208,7 +208,7 @@ def is_sparse24_bitmask_supported( ] for submodule in model.modules(): - if is_module_quantized(submodule): + if not is_module_quantized(submodule): continue weight_scheme = submodule.quantization_scheme.weights From 515c06ba3901d67b8914e94e71764e1091713f8d Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Mon, 7 Jul 2025 13:21:16 -0400 Subject: [PATCH 3/6] fix kv cache tests Signed-off-by: Kyle Sayers --- .../transformers/kv_cache/test_kv_cache.py | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/tests/llmcompressor/transformers/kv_cache/test_kv_cache.py b/tests/llmcompressor/transformers/kv_cache/test_kv_cache.py index acbc479e7..7038c42d4 100644 --- a/tests/llmcompressor/transformers/kv_cache/test_kv_cache.py +++ b/tests/llmcompressor/transformers/kv_cache/test_kv_cache.py @@ -3,7 +3,7 @@ import pytest from accelerate import init_empty_weights -from compressed_tensors.quantization.lifecycle import KVCacheScaleType +from compressed_tensors.quantization import KVCacheScaleType, is_attention_module from datasets import load_dataset from transformers import AutoConfig, AutoModelForCausalLM, 
AutoTokenizer from transformers.utils.quantization_config import CompressedTensorsConfig @@ -159,10 +159,10 @@ def test_kv_cache_model_state_dict_attr(oneshot_fixture, tmp_path): counts = 0 for name, submodule in model.named_modules(): - counts += 1 - assert "self_attn" in name - assert hasattr(submodule, KVCacheScaleType.VALUE.value) - assert hasattr(submodule, KVCacheScaleType.KEY.value) + if is_attention_module(submodule): + counts += 1 + assert hasattr(submodule, KVCacheScaleType.VALUE.value) + assert hasattr(submodule, KVCacheScaleType.KEY.value) assert counts > 0 @@ -198,10 +198,10 @@ def test_kv_cache_gptq_config_format(kv_cache_fixture, tmp_path): counts = 0 for name, submodule in model.named_modules(): - counts += 1 - assert "self_attn" in name - assert hasattr(submodule, KVCacheScaleType.VALUE.value) - assert hasattr(submodule, KVCacheScaleType.KEY.value) + if is_attention_module(submodule): + counts += 1 + assert hasattr(submodule, KVCacheScaleType.VALUE.value) + assert hasattr(submodule, KVCacheScaleType.KEY.value) assert counts > 0 @@ -242,9 +242,9 @@ def test_kv_cache_gptq_model_state_dict_attr(kv_cache_fixture, tmp_path): counts = 0 for name, submodule in model.named_modules(): - counts += 1 - assert "self_attn" in name - assert hasattr(submodule, KVCacheScaleType.VALUE.value) - assert hasattr(submodule, KVCacheScaleType.KEY.value) + if is_attention_module(submodule): + counts += 1 + assert hasattr(submodule, KVCacheScaleType.VALUE.value) + assert hasattr(submodule, KVCacheScaleType.KEY.value) assert counts > 0 From 62e93a7deecba07d79c5ab2e8a8f2b1d2ceb18d8 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Tue, 8 Jul 2025 10:37:26 -0400 Subject: [PATCH 4/6] rename to untargetable Signed-off-by: Kyle Sayers --- src/llmcompressor/observers/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llmcompressor/observers/base.py b/src/llmcompressor/observers/base.py index 3ee446cf3..a4a648e51 100644 --- a/src/llmcompressor/observers/base.py +++ b/src/llmcompressor/observers/base.py @@ -2,7 +2,7 @@ from typing import Any, Iterable, Optional, Tuple, Union import torch -from compressed_tensors import InternalModule +from compressed_tensors import UntargetableModule from compressed_tensors.quantization.quant_args import ( FP8_E4M3_DATA, QuantizationArgs, @@ -17,7 +17,7 @@ __all__ = ["Observer"] -class Observer(InternalModule, RegistryMixin): +class Observer(UntargetableModule, RegistryMixin): """ Base Observer class to be subclassed for specific implementation. Subclasses should override `calculate_qparams` to return a scale, zero_point From 9d1adbff3d61db7c63c63324db7486f0db05d4e6 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Tue, 8 Jul 2025 13:36:48 -0400 Subject: [PATCH 5/6] Revert "rename to untargetable" This reverts commit 62e93a7deecba07d79c5ab2e8a8f2b1d2ceb18d8. 
--- src/llmcompressor/observers/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llmcompressor/observers/base.py b/src/llmcompressor/observers/base.py index a4a648e51..3ee446cf3 100644 --- a/src/llmcompressor/observers/base.py +++ b/src/llmcompressor/observers/base.py @@ -2,7 +2,7 @@ from typing import Any, Iterable, Optional, Tuple, Union import torch -from compressed_tensors import UntargetableModule +from compressed_tensors import InternalModule from compressed_tensors.quantization.quant_args import ( FP8_E4M3_DATA, QuantizationArgs, @@ -17,7 +17,7 @@ __all__ = ["Observer"] -class Observer(UntargetableModule, RegistryMixin): +class Observer(InternalModule, RegistryMixin): """ Base Observer class to be subclassed for specific implementation. Subclasses should override `calculate_qparams` to return a scale, zero_point From 53f0265a667aefc404393320219a7f07f1ef814f Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Tue, 8 Jul 2025 13:38:38 -0400 Subject: [PATCH 6/6] change get_layers Signed-off-by: Kyle Sayers --- src/llmcompressor/utils/pytorch/module.py | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/src/llmcompressor/utils/pytorch/module.py b/src/llmcompressor/utils/pytorch/module.py index c923af413..72144cc0f 100644 --- a/src/llmcompressor/utils/pytorch/module.py +++ b/src/llmcompressor/utils/pytorch/module.py @@ -8,14 +8,13 @@ from typing import Dict, List, Optional, Tuple, Union import torch +from compressed_tensors import InternalModule from compressed_tensors.quantization.utils import is_module_quantized -from compressed_tensors.transform import TransformBase from torch.nn import Linear, Module, Parameter from torch.nn.modules.conv import _ConvNd from transformers import PreTrainedModel from llmcompressor.core import ModelParameterizedLayer -from llmcompressor.observers import Observer from llmcompressor.utils.fsdp.context import ( fix_fsdp_module_name, summon_full_params_context, @@ -161,18 +160,6 @@ def match_layers_params( return resolved -def is_internal_module(module: Module) -> bool: - """ - llm-compressor adds additional modules to a model, like observers - and transforms, as part of its normal operation - - :param name: name of module - :return: True if name indicates a module internally instantiated by - llm-compressor, otherwise False - """ - return isinstance(module, (TransformBase, Observer)) - - def get_layers( targets: Union[str, List[str]], module: Module, @@ -197,7 +184,7 @@ def get_layers( layer_dict = { name: layer for name, layer in layer_dict.items() - if not is_internal_module(layer) + if not isinstance(layer, InternalModule) } return layer_dict
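
For reference, a minimal sketch of the module-iteration pattern this series converges on: walk submodules with torch's `named_modules()`, skip helpers that compressed-tensors marks as `InternalModule` (observers, transforms), and inspect only quantized modules. The imports mirror those used in the diffs; `iter_quantized_modules` is an illustrative helper, not part of the patches.

```python
import torch
from compressed_tensors import InternalModule
from compressed_tensors.quantization.utils import is_module_quantized


def iter_quantized_modules(model: torch.nn.Module):
    """Yield (name, module) pairs for quantized submodules, replacing
    the removed iter_named_leaf_modules-style traversal."""
    for name, submodule in model.named_modules():
        if isinstance(submodule, InternalModule):
            continue  # observers/transforms added internally by llm-compressor
        if is_module_quantized(submodule):
            yield name, submodule
```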