
Deprecate iter_named_leaf_modules and iter_named_quantizable_modules #1628


Merged · 7 commits · Jul 9, 2025
5 changes: 1 addition & 4 deletions src/llmcompressor/modifiers/smoothquant/base.py
@@ -127,10 +127,7 @@ def on_initialize(self, state: State, **kwargs) -> bool:
f"Expected start to be None or -1, got {self.end}"
)

if (
not hasattr(state, 'data') or
state.data.calib is None
):
if not hasattr(state, "data") or state.data.calib is None:
raise ValueError(
f"{self.__class__.__name__} requires a calibration dataset to be "
"provided"
4 changes: 2 additions & 2 deletions src/llmcompressor/observers/base.py
@@ -2,6 +2,7 @@
from typing import Any, Iterable, Optional, Tuple, Union

import torch
from compressed_tensors import InternalModule
from compressed_tensors.quantization.quant_args import (
FP8_E4M3_DATA,
QuantizationArgs,
@@ -12,12 +13,11 @@
from compressed_tensors.utils import safe_permute
from loguru import logger
from torch import FloatTensor, IntTensor, Tensor
from torch.nn import Module

__all__ = ["Observer"]


class Observer(Module, RegistryMixin):
class Observer(InternalModule, RegistryMixin):
"""
Base Observer class to be subclassed for specific implementation.
Subclasses should override `calculate_qparams` to return a scale, zero_point
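Because Observer now inherits from InternalModule rather than torch.nn.Module directly, the bespoke leaf iterators are no longer needed to skip the helper modules llm-compressor attaches during calibration: a plain isinstance check while walking named_modules() does the same job. A minimal sketch of that filtering pattern, assuming compressed_tensors exports InternalModule at the package root as the import above does; the helper name iter_user_modules is illustrative only:

import torch
from compressed_tensors import InternalModule


def iter_user_modules(model: torch.nn.Module):
    # Yield (name, module) pairs while skipping modules that llm-compressor
    # instantiated internally (observers and similar helpers), which now
    # subclass InternalModule.
    for name, module in model.named_modules():
        if isinstance(module, InternalModule):
            continue
        yield name, module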
4 changes: 2 additions & 2 deletions src/llmcompressor/transformers/compression/helpers.py
@@ -3,7 +3,7 @@

import torch
from accelerate.accelerator import get_state_dict_offloaded_model
from compressed_tensors.quantization.utils import iter_named_leaf_modules, module_type
from compressed_tensors.quantization.utils import module_type
from compressed_tensors.utils import align_module_device
from tqdm import tqdm

@@ -163,7 +163,7 @@ def _get_sparse_targets_ignore_dicts(
exhaustive_targets = defaultdict(list)
exhaustive_ignore = defaultdict(list)

for name, submodule in iter_named_leaf_modules(module):
for name, submodule in module.named_modules():
submodule_type = module_type(submodule)
is_target = is_sparse_compression_target(
module=submodule,
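Note the behavioral nuance in this hunk: the deprecated iter_named_leaf_modules yielded only leaf modules, whereas module.named_modules() yields every submodule, containers included; presumably the downstream module_type and is_sparse_compression_target checks make the broader traversal acceptable here. For callers that genuinely need leaf-only traversal, a rough stand-in, assuming "leaf" simply means a module with no children (the name and semantics here are illustrative, not part of this PR):

import torch


def iter_named_leaf_modules(model: torch.nn.Module):
    # Approximate the deprecated helper: yield only modules that have no
    # child modules of their own.
    for name, module in model.named_modules():
        if next(module.children(), None) is None:
            yield name, module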
@@ -6,7 +6,6 @@
from compressed_tensors.quantization.utils import (
is_model_quantized,
is_module_quantized,
iter_named_leaf_modules,
)

__all__ = ["infer_quantization_format"]
@@ -107,7 +106,7 @@ def _get_unique_quant_args(model):
"""
quant_info_weight = []
quant_info_inputs = []
for _, submodule in iter_named_leaf_modules(model):
for submodule in model.modules():
if is_module_quantized(submodule):
weight_scheme = submodule.quantization_scheme.weights
input_scheme = submodule.quantization_scheme.input_activations
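The loop above goes on to collect each quantized submodule's weight and input-activation args; a compact, self-contained sketch of that traversal pattern, assuming only the imports and attributes already shown in this hunk (the helper name and de-duplication by equality are illustrative):

from compressed_tensors.quantization.utils import is_module_quantized


def collect_unique_quant_args(model):
    # Walk every submodule and record the distinct weight / input-activation
    # quantization args found on quantized modules.
    weight_args, input_args = [], []
    for submodule in model.modules():
        if not is_module_quantized(submodule):
            continue
        scheme = submodule.quantization_scheme
        if scheme.weights is not None and scheme.weights not in weight_args:
            weight_args.append(scheme.weights)
        if (
            scheme.input_activations is not None
            and scheme.input_activations not in input_args
        ):
            input_args.append(scheme.input_activations)
    return weight_args, input_args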
@@ -6,7 +6,6 @@
from compressed_tensors.quantization.utils import (
is_model_quantized,
is_module_quantized,
iter_named_leaf_modules,
)
from loguru import logger
from torch import Tensor
@@ -208,33 +207,34 @@ def is_sparse24_bitmask_supported(
        QuantizationType.FLOAT.value,
    ]

    for _, submodule in iter_named_leaf_modules(model):
        if is_module_quantized(submodule):
            weight_scheme = submodule.quantization_scheme.weights
            input_scheme = submodule.quantization_scheme.input_activations

            if weight_scheme and input_scheme:
                # weight and activation quantization
                # check schemes are supported
                for scheme in [weight_scheme, input_scheme]:
                    scheme_supported = (
                        scheme.num_bits == 8
                        and scheme.type in supported_scheme_types
                    )
                    if not scheme_supported:
                        logger.info(
                            "Quantization scheme not supported,"
                            " turning off sparse 24 compression."
                            f" Invalid Scheme: {scheme}"
                        )
                        return False

            elif weight_scheme or input_scheme:
                # weight only quantization
                logger.info(
                    "Weight only quantization detected, "
                    "turning off sparse 24 compression."
                )
                return False

    for submodule in model.modules():
        if not is_module_quantized(submodule):
            continue

        weight_scheme = submodule.quantization_scheme.weights
        input_scheme = submodule.quantization_scheme.input_activations

        if weight_scheme and input_scheme:
            # weight and activation quantization
            # check schemes are supported
            for scheme in [weight_scheme, input_scheme]:
                scheme_supported = (
                    scheme.num_bits == 8 and scheme.type in supported_scheme_types
                )
                if not scheme_supported:
                    logger.info(
                        "Quantization scheme not supported,"
                        " turning off sparse 24 compression."
                        f" Invalid Scheme: {scheme}"
                    )
                    return False

        elif weight_scheme or input_scheme:
            # weight only quantization
            logger.info(
                "Weight only quantization detected, "
                "turning off sparse 24 compression."
            )
            return False

    return True
17 changes: 2 additions & 15 deletions src/llmcompressor/utils/pytorch/module.py
@@ -8,14 +8,13 @@
from typing import Dict, List, Optional, Tuple, Union

import torch
from compressed_tensors import InternalModule
from compressed_tensors.quantization.utils import is_module_quantized
from compressed_tensors.transform import TransformBase
from torch.nn import Linear, Module, Parameter
from torch.nn.modules.conv import _ConvNd
from transformers import PreTrainedModel

from llmcompressor.core import ModelParameterizedLayer
from llmcompressor.observers import Observer
from llmcompressor.utils.fsdp.context import (
fix_fsdp_module_name,
summon_full_params_context,
@@ -161,18 +160,6 @@ def match_layers_params(
return resolved


def is_internal_module(module: Module) -> bool:
"""
llm-compressor adds additional modules to a model, like observers
and transforms, as part of its normal operation

:param name: name of module
:return: True if name indicates a module internally instantiated by
llm-compressor, otherwise False
"""
return isinstance(module, (TransformBase, Observer))


def get_layers(
targets: Union[str, List[str]],
module: Module,
@@ -197,7 +184,7 @@ def get_layers(
layer_dict = {
name: layer
for name, layer in layer_dict.items()
if not is_internal_module(layer)
if not isinstance(layer, InternalModule)
}

return layer_dict
@@ -120,6 +120,7 @@ def forward(x, y):
wrapped_fn = wrapper._wrapper_fn_defs[0]
arg_names = {arg.arg for arg in wrapped_fn.args.args}

assert arg_names == {"x", "y"}, (
f"Expected arguments {{'x', 'y'}}, but got {arg_names}"
)
assert arg_names == {
"x",
"y",
}, f"Expected arguments {{'x', 'y'}}, but got {arg_names}"
@@ -5,7 +5,6 @@

import torch
from compressed_tensors.linear.compressed_linear import CompressedLinear
from compressed_tensors.quantization.utils import iter_named_leaf_modules
from parameterized import parameterized_class
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.utils.quantization_config import CompressedTensorsConfig
@@ -132,9 +131,7 @@ def setUpClass(cls):

def test_compressed_linear_modules_exist(self):
compressed_linear_counts = 0
for _, submodule in iter_named_leaf_modules(
self.compressed_model,
):
for submodule in self.compressed_model.modules():
if isinstance(submodule, CompressedLinear):
compressed_linear_counts += 1

@@ -32,11 +32,6 @@ def mock_is_model_quantized(model):
return model.is_quantized


def mock_iter_named_leaf_modules(model):
for name, module in model.named_modules():
yield name, module


# Mock model class
class MockModel(Module):
def __init__(
@@ -99,10 +94,6 @@ def setup_mocks(self, request):
f"{SPARSITY_CONFIG_LOCATION}.is_model_quantized",
side_effect=mock_is_model_quantized,
),
patch(
f"{SPARSITY_CONFIG_LOCATION}.iter_named_leaf_modules",
side_effect=mock_iter_named_leaf_modules,
),
]
for patcher in patchers:
patcher.start()
39 changes: 16 additions & 23 deletions tests/llmcompressor/transformers/kv_cache/test_kv_cache.py
@@ -3,8 +3,7 @@

import pytest
from accelerate import init_empty_weights
from compressed_tensors.quantization.lifecycle import KVCacheScaleType
from compressed_tensors.quantization.utils.helpers import iter_named_quantizable_modules
from compressed_tensors.quantization import KVCacheScaleType, is_attention_module
from datasets import load_dataset
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from transformers.utils.quantization_config import CompressedTensorsConfig
@@ -159,13 +158,11 @@ def test_kv_cache_model_state_dict_attr(oneshot_fixture, tmp_path):
model = AutoModelForCausalLM.from_pretrained(str(output_dir))

counts = 0
for name, submodule in iter_named_quantizable_modules(
model, include_children=False, include_attn=True
):
counts += 1
assert "self_attn" in name
assert hasattr(submodule, KVCacheScaleType.VALUE.value)
assert hasattr(submodule, KVCacheScaleType.KEY.value)
for name, submodule in model.named_modules():
if is_attention_module(submodule):
counts += 1
assert hasattr(submodule, KVCacheScaleType.VALUE.value)
assert hasattr(submodule, KVCacheScaleType.KEY.value)
assert counts > 0


@@ -200,13 +197,11 @@ def test_kv_cache_gptq_config_format(kv_cache_fixture, tmp_path):
model = AutoModelForCausalLM.from_pretrained(output_dir)

counts = 0
for name, submodule in iter_named_quantizable_modules(
model, include_children=False, include_attn=True
):
counts += 1
assert "self_attn" in name
assert hasattr(submodule, KVCacheScaleType.VALUE.value)
assert hasattr(submodule, KVCacheScaleType.KEY.value)
for name, submodule in model.named_modules():
if is_attention_module(submodule):
counts += 1
assert hasattr(submodule, KVCacheScaleType.VALUE.value)
assert hasattr(submodule, KVCacheScaleType.KEY.value)

assert counts > 0

@@ -246,12 +241,10 @@ def test_kv_cache_gptq_model_state_dict_attr(kv_cache_fixture, tmp_path):
)

counts = 0
for name, submodule in iter_named_quantizable_modules(
model, include_children=False, include_attn=True
):
counts += 1
assert "self_attn" in name
assert hasattr(submodule, KVCacheScaleType.VALUE.value)
assert hasattr(submodule, KVCacheScaleType.KEY.value)
for name, submodule in model.named_modules():
if is_attention_module(submodule):
counts += 1
assert hasattr(submodule, KVCacheScaleType.VALUE.value)
assert hasattr(submodule, KVCacheScaleType.KEY.value)

assert counts > 0
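
The three kv-cache tests above share the same counting pattern; a small helper sketch that captures it, assuming only the compressed_tensors imports already used at the top of this test file (the helper name is illustrative):

import torch
from compressed_tensors.quantization import KVCacheScaleType, is_attention_module


def count_attention_modules_with_kv_scales(model: torch.nn.Module) -> int:
    # Count attention submodules that carry both kv-cache scale attributes.
    count = 0
    for _, submodule in model.named_modules():
        if not is_attention_module(submodule):
            continue
        if hasattr(submodule, KVCacheScaleType.KEY.value) and hasattr(
            submodule, KVCacheScaleType.VALUE.value
        ):
            count += 1
    return count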