Skip to content

Commit 3ac19fa

Browse files
committed
Merge remote-tracking branch 'origin' into kylesayrs/reduce-quantized-compression-memory
Merge commit 3ac19fa (2 parents: 1862e0f and 4438d08)

File tree

7 files changed

+82
-14
lines changed

7 files changed

+82
-14
lines changed

.github/workflows/build.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ jobs:
7676

7777
- name: build
7878
id: build
79-
uses: neuralmagic/nm-actions/actions/build-ml-whl@v1.18.0
79+
uses: neuralmagic/nm-actions/actions/build-ml-whl@c7e5a66c382104e1beadcb7dadf429f8ab15b344 # v1.20.0
8080
with:
8181
dev: false
8282
release: ${{ inputs.wf_category == 'RELEASE' }}

.github/workflows/test-check.yaml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,16 +12,16 @@ jobs:
1212
python-tests:
1313
runs-on: ubuntu-24.04
1414
steps:
15-
- uses: actions/setup-python@v4
15+
- uses: actions/setup-python@v5
1616
with:
1717
python-version: '3.10'
18-
- uses: actions/checkout@v3
18+
- uses: actions/checkout@v4
19+
with:
20+
fetch-depth: 0
21+
fetch-tags: true
1922
- name: Set Env
2023
run: |
2124
pip3 install --upgrade pip && pip3 install --upgrade setuptools
22-
pip3 install virtualenv
23-
virtualenv venv
24-
source venv/bin/activate
2525
- name: "⚙️ Install dependencies"
2626
run: pip3 install .[dev,accelerate]
2727
- name: "🔬 Running tests"

pyproject.toml

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,7 @@
11
[build-system]
2-
requires = ["setuptools", "wheel", "setuptools_scm>8"]
2+
requires = ["setuptools", "wheel", "setuptools_scm==8.2.0"]
33
build-backend = "setuptools.build_meta"
44

5-
[tool.setuptools_scm]
6-
version_file = "src/compressed_tensors/version.py"
7-
85
[tool.black]
96
line-length = 88
107
target-version = ['py36']

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ def _setup_extras() -> Dict:
101101
use_scm_version={
102102
"version_scheme": version_func,
103103
"local_scheme": localversion_func,
104+
"version_file": "src/compressed_tensors/version.py",
104105
},
105106
author="Neuralmagic, Inc.",
106107
author_email="support@neuralmagic.com",

src/compressed_tensors/compressors/quantized_compressors/base.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ def decompress(
163163
self,
164164
path_to_model_or_tensors: Union[str, Path, Dict[str, Any]],
165165
names_to_scheme: Dict[str, QuantizationScheme],
166-
device: torch.device = "cpu",
166+
device: str = "cpu",
167167
) -> Generator[Tuple[str, Tensor], None, None]:
168168
"""
169169
Reads a compressed state dict located at path_to_model_or_tensors
@@ -172,7 +172,8 @@ def decompress(
172172
:param path_to_model_or_tensors: path to compressed safetensors model (directory
173173
with one or more safetensors files) or compressed tensors file
174174
:param names_to_scheme: quantization scheme for each quantized weight
175-
:param device: optional device to load intermediate weights into
175+
:param device: optional device to load intermediate weights into (must be `str`,
176+
not `torch.device`)
176177
:return: compressed state dict
177178
"""
178179
if isinstance(path_to_model_or_tensors, (str, Path)):
@@ -189,7 +190,7 @@ def _decompress_from_path(
189190
self,
190191
path_to_model: Union[str, Path, Dict[str, Any]],
191192
names_to_scheme: Dict[str, QuantizationScheme],
192-
device: torch.device,
193+
device: str,
193194
):
194195
weight_mappings = get_nested_weight_mappings(
195196
path_to_model, self.compression_param_names

src/compressed_tensors/utils/offload.py

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
import contextlib
2929
import warnings
3030
from functools import wraps
31-
from typing import Any, Callable, Dict, Literal, Optional, Union
31+
from typing import Any, Callable, Dict, Iterable, Literal, Optional, Union
3232

3333
import torch
3434

@@ -67,6 +67,8 @@
6767
"delete_offload_parameter",
6868
"has_offloaded_params",
6969
"disable_hf_hook",
70+
"disable_offload",
71+
"align_modules",
7072
"align_module_device",
7173
]
7274

@@ -344,6 +346,43 @@ def delete_from_weights_map(
344346
)
345347

346348

349+
@contextlib.contextmanager
def disable_offload(module: torch.nn.Module):
    """
    Context manager to disable module onloading and offloading. Parameters will stay on
    their current device

    :param module: module to disable offloading for
    """
    if has_offloaded_params(module):
        # remember the hook's current setting rather than assuming it was True,
        # so nested uses (or hooks created with offload=False) are restored correctly
        original_offload = module._hf_hook.offload
        module._hf_hook.offload = False
        try:
            yield
        finally:
            # restore even if the body raises, otherwise offloading would stay
            # disabled for the rest of the module's lifetime
            module._hf_hook.offload = original_offload
    else:
        # module has no offload hook attached; nothing to disable
        yield
364+
365+
@contextlib.contextmanager
def align_modules(
    modules: Union[torch.nn.Module, Iterable[torch.nn.Module]],
    execution_device: Optional[torch.device] = None,
):
    """
    Context manager for onloading modules to a device, and disabling onload and offload
    attempts triggered by forward calls. Used for sequential onloading of layers

    :param modules: `torch.nn.Module` or iterable of `torch.nn.Module`s to onload
    :param execution_device: device to onload to
    """
    # normalize the single-module case into an iterable
    if isinstance(modules, torch.nn.Module):
        modules = (modules,)

    # an ExitStack lets us enter a variable number of nested contexts:
    # one device-alignment context plus one offload-disable context per module
    with contextlib.ExitStack() as stack:
        for mod in modules:
            stack.enter_context(align_module_device(mod, execution_device))
            stack.enter_context(disable_offload(mod))  # disable redundant onloading
        yield
385+
347386
""" Upstreamed Functions """
348387

349388

tests/test_utils/test_offload.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import torch
1616
from compressed_tensors.utils import (
1717
align_module_device,
18+
align_modules,
1819
delete_offload_parameter,
1920
disable_hf_hook,
2021
get_execution_device,
@@ -248,6 +249,35 @@ def test_disable_hf_hook_model_recurse():
248249
assert hasattr(module2, "_hf_hook")
249250

250251

252+
@requires_accelerate()
def test_align_modules():
    from accelerate.hooks import attach_align_device_hook

    # build a small nested model out of three example modules
    inner0 = ExampleModule()
    inner1 = ExampleModule()
    inner2 = ExampleModule()
    model = torch.nn.Sequential(inner0, torch.nn.Sequential(inner1, inner2))

    # offload all parameters; weights live in the weights map, params go to meta
    attach_align_device_hook(
        model,
        execution_device=torch.device("cpu"),
        offload=True,
        weights_map=model.state_dict(),
    )

    meta = torch.device("meta")
    for mod in (inner0, inner1, inner2):
        assert mod.a.device == meta

    # only the modules passed to align_modules are onloaded; the third stays offloaded
    with align_modules((inner0, inner1)):
        assert inner0.a.device != meta
        assert inner1.a.device != meta
        assert inner2.a.device == meta

    # leaving the context offloads everything again
    for mod in (inner0, inner1, inner2):
        assert mod.a.device == meta
280+
251281
@requires_accelerate()
252282
def test_offload_to_weights_map():
253283
from accelerate.utils import OffloadedWeightsLoader, PrefixedDataset

0 commit comments

Comments (0)