From 38c213ff49cfabb499c6249b6c8203747fcff2b6 Mon Sep 17 00:00:00 2001 From: Aryan Date: Thu, 12 Jun 2025 00:57:16 +0200 Subject: [PATCH] update --- src/diffusers/hooks/group_offloading.py | 4 +++ tests/quantization/bnb/test_4bit.py | 2 +- tests/quantization/bnb/test_mixed_int8.py | 2 +- .../quantization/test_torch_compile_utils.py | 25 +++++++++++++-- tests/quantization/torchao/test_torchao.py | 31 +++++++++++++++++++ 5 files changed, 60 insertions(+), 4 deletions(-) diff --git a/src/diffusers/hooks/group_offloading.py b/src/diffusers/hooks/group_offloading.py index 565f8f1ff860..f96f6cbbe1ef 100644 --- a/src/diffusers/hooks/group_offloading.py +++ b/src/diffusers/hooks/group_offloading.py @@ -219,6 +219,7 @@ def initialize_hook(self, module: torch.nn.Module) -> torch.nn.Module: return module def pre_forward(self, module: torch.nn.Module, *args, **kwargs): + breakpoint() # If there wasn't an onload_leader assigned, we assume that the submodule that first called its forward # method is the onload_leader of the group. if self.group.onload_leader is None: @@ -285,6 +286,7 @@ def callback(): return module def post_forward(self, module, output): + breakpoint() # At this point, for the current modules' submodules, we know the execution order of the layers. We can now # remove the layer execution tracker hooks and apply prefetching by setting the next_group attribute for each # group offloading hook. @@ -624,7 +626,9 @@ def _apply_group_offloading_leaf_level( modules_with_group_offloading = set() for name, submodule in module.named_modules(): if not isinstance(submodule, _SUPPORTED_PYTORCH_LAYERS): + print("unsupported module", name, type(submodule)) continue + print("applying group offloading to", name, type(submodule)) group = ModuleGroup( modules=[submodule], offload_device=offload_device, diff --git a/tests/quantization/bnb/test_4bit.py b/tests/quantization/bnb/test_4bit.py index 2d8b9f698bfe..c6d59e8b71ed 100644 --- a/tests/quantization/bnb/test_4bit.py +++ b/tests/quantization/bnb/test_4bit.py @@ -881,4 +881,4 @@ def test_torch_compile_with_cpu_offload(self): super()._test_torch_compile_with_cpu_offload(quantization_config=self.quantization_config) def test_torch_compile_with_group_offload(self): - super()._test_torch_compile_with_group_offload(quantization_config=self.quantization_config) + super()._test_torch_compile_with_group_offload_leaf_stream(quantization_config=self.quantization_config) diff --git a/tests/quantization/bnb/test_mixed_int8.py b/tests/quantization/bnb/test_mixed_int8.py index b15a9f72a8f6..fc4d6127fef9 100644 --- a/tests/quantization/bnb/test_mixed_int8.py +++ b/tests/quantization/bnb/test_mixed_int8.py @@ -845,6 +845,6 @@ def test_torch_compile_with_cpu_offload(self): @pytest.mark.xfail(reason="Test fails because of an offloading problem from Accelerate with confusion in hooks.") def test_torch_compile_with_group_offload(self): - super()._test_torch_compile_with_group_offload( + super()._test_torch_compile_with_group_offload_leaf_stream( quantization_config=self.quantization_config, torch_dtype=torch.float16 ) diff --git a/tests/quantization/test_torch_compile_utils.py b/tests/quantization/test_torch_compile_utils.py index 1ae77b27d7cd..63d09922f11e 100644 --- a/tests/quantization/test_torch_compile_utils.py +++ b/tests/quantization/test_torch_compile_utils.py @@ -64,7 +64,29 @@ def _test_torch_compile_with_cpu_offload(self, quantization_config, torch_dtype= # small resolutions to ensure speedy execution. 
pipe("a dog", num_inference_steps=3, max_sequence_length=16, height=256, width=256) - def _test_torch_compile_with_group_offload(self, quantization_config, torch_dtype=torch.bfloat16): + def _test_torch_compile_with_group_offload_leaf(self, quantization_config, torch_dtype=torch.bfloat16): + torch._dynamo.config.cache_size_limit = 10000 + + pipe = self._init_pipeline(quantization_config, torch_dtype) + group_offload_kwargs = { + "onload_device": torch.device("cuda"), + "offload_device": torch.device("cpu"), + "offload_type": "leaf_level", + "num_blocks_per_group": 1, + "use_stream": False, + } + pipe.transformer.enable_group_offload(**group_offload_kwargs) + # pipe.transformer.compile() + for name, component in pipe.components.items(): + if name != "transformer" and isinstance(component, torch.nn.Module): + if torch.device(component.device).type == "cpu": + component.to("cuda") + + for _ in range(2): + # small resolutions to ensure speedy execution. + pipe("a dog", num_inference_steps=3, max_sequence_length=16, height=256, width=256) + + def _test_torch_compile_with_group_offload_leaf_stream(self, quantization_config, torch_dtype=torch.bfloat16): torch._dynamo.config.cache_size_limit = 10000 pipe = self._init_pipeline(quantization_config, torch_dtype) @@ -73,7 +95,6 @@ def _test_torch_compile_with_group_offload(self, quantization_config, torch_dtyp "offload_device": torch.device("cpu"), "offload_type": "leaf_level", "use_stream": True, - "non_blocking": True, } pipe.transformer.enable_group_offload(**group_offload_kwargs) pipe.transformer.compile() diff --git a/tests/quantization/torchao/test_torchao.py b/tests/quantization/torchao/test_torchao.py index 743da17356f7..9ab6a3242a56 100644 --- a/tests/quantization/torchao/test_torchao.py +++ b/tests/quantization/torchao/test_torchao.py @@ -29,6 +29,7 @@ TorchAoConfig, ) from diffusers.models.attention_processor import Attention +from diffusers.quantizers import PipelineQuantizationConfig from diffusers.utils.testing_utils import ( backend_empty_cache, backend_synchronize, @@ -44,6 +45,8 @@ torch_device, ) +from ..test_torch_compile_utils import QuantCompileTests + enable_full_determinism() @@ -625,6 +628,34 @@ def test_int_a16w8_cpu(self): self._check_serialization_expected_slice(quant_method, quant_method_kwargs, expected_slice, device) +@require_torchao_version_greater_or_equal("0.7.0") +class TorchAoCompileTest(QuantCompileTests): + quantization_config = PipelineQuantizationConfig( + quant_mapping={ + "transformer": TorchAoConfig(quant_type="int8_weight_only"), + }, + ) + + def test_torch_compile(self): + super()._test_torch_compile(quantization_config=self.quantization_config) + + def test_torch_compile_with_cpu_offload(self): + super()._test_torch_compile_with_cpu_offload(quantization_config=self.quantization_config) + + def test_torch_compile_with_group_offload_leaf(self): + from diffusers.utils.logging import set_verbosity_debug + + set_verbosity_debug() + super()._test_torch_compile_with_group_offload_leaf(quantization_config=self.quantization_config) + + @unittest.skip( + "Using non-default stream requires ability to pin tensors. AQT does not seem to support this yet in TorchAO." 
+ ) + def test_torch_compile_with_group_offload_leaf_stream(self): + # NotImplementedError: AffineQuantizedTensor dispatch: attempting to run unimplemented operator/function: func=, types=(,), arg_types=(,), kwarg_types={} + super()._test_torch_compile_with_group_offload_leaf_stream(quantization_config=self.quantization_config) + + # Slices for these tests have been obtained on our aws-g6e-xlarge-plus runners @require_torch @require_torch_accelerator