
Commit 7b5a7a4

[Transform] Accelerate Utilities (#328)
* add utilities
* add tests
* add additional tests
* add delete_offload_module

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
1 parent 33a3f9d commit 7b5a7a4

File tree

2 files changed: +229 -6 lines

src/compressed_tensors/utils/offload.py

Lines changed: 134 additions & 1 deletion
@@ -28,15 +28,18 @@
 import contextlib
 import warnings
 from functools import wraps
-from typing import Any, Callable, Dict, Iterable, Literal, Optional, Union
+from typing import Any, Callable, Dict, Iterable, List, Literal, Optional, Union

 import torch


 try:
+    from accelerate import dispatch_model
     from accelerate.hooks import (
         AlignDevicesHook,
         add_hook_to_module,
+        attach_align_device_hook,
+        named_module_tensors,
         remove_hook_from_module,
     )
     from accelerate.utils import (
@@ -54,6 +57,9 @@
     OffloadedWeightsLoader = None
     PrefixedDataset = None
     set_module_tensor_to_device = None
+    named_module_tensors = None
+    dispatch_model = None
+    attach_align_device_hook = None


 __all__ = [
@@ -70,13 +76,21 @@
     "disable_offload",
     "align_modules",
     "align_module_device",
+    "register_offload_module",
+    "delete_offload_module",
+    "force_cpu_offload",
 ]


 def check_accelerate(fallback: Any):
     def decorator(func: Callable[[Any], Any]):
         if not _has_accelerate:

+            if fallback == "error":
+                raise ValueError(
+                    "Please install `accelerate` in order to use this function"
+                )
+
             @wraps(func)
             def fallback_fn(*args, **kwargs):
                 return fallback
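Aside: the new `fallback == "error"` branch changes how decorated functions fail when `accelerate` is missing. Previously every decorated function was silently swapped for a stub returning the fallback value; now `fallback="error"` raises at decoration time instead. A minimal sketch of that behavior, with `_has_accelerate` hard-coded to False to simulate a missing install (the `return func` path is assumed from surrounding context, since the diff does not show it):

import contextlib
from functools import wraps
from typing import Any, Callable

_has_accelerate = False  # illustrative: pretend `accelerate` is not installed

def check_accelerate(fallback: Any):
    def decorator(func: Callable[[Any], Any]):
        if not _has_accelerate:
            # new in this commit: fail fast instead of returning a stub
            if fallback == "error":
                raise ValueError(
                    "Please install `accelerate` in order to use this function"
                )

            @wraps(func)
            def fallback_fn(*args, **kwargs):
                return fallback

            return fallback_fn
        return func

    return decorator

@check_accelerate(fallback=contextlib.nullcontext())
def disable_offload(module):
    ...  # real body never runs without accelerate

with disable_offload(object()):  # stub returns the nullcontext fallback
    pass

try:
    @check_accelerate(fallback="error")
    def force_cpu_offload(module, execution_device):
        ...
except ValueError as err:
    print(err)  # Please install `accelerate` in order to use this function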
@@ -346,6 +360,7 @@ def delete_from_weights_map(
     )


+@check_accelerate(fallback=contextlib.nullcontext())
 @contextlib.contextmanager
 def disable_offload(module: torch.nn.Module):
     """
@@ -362,6 +377,7 @@ def disable_offload(module: torch.nn.Module):
     yield


+@check_accelerate(fallback=contextlib.nullcontext())
 @contextlib.contextmanager
 def align_modules(
     modules: Union[torch.nn.Module, Iterable[torch.nn.Module]],
@@ -383,6 +399,123 @@ def align_modules(
     yield


+def register_offload_module(base: torch.nn.Module, name: str, module: torch.nn.Module):
+    """
+    Register a submodule with offloading if the parent module is offloaded
+
+    :param base: module to attach the submodule to
+    :param name: name of the submodule
+    :param module: submodule to attach
+    """
+
+    if has_offloaded_params(base):
+        hook: AlignDevicesHook = base._hf_hook
+        assert hook.offload
+        assert hook.weights_map is not None
+        assert hook.tied_params_map is not None
+
+        # offloading kwargs for submodule
+        place_submodules = False
+        offload_buffers = True
+
+        # copy device offloading arguments from parent
+        current_device = next(base.parameters()).device  # assume base has parameters
+        offload_device = get_offloaded_device(base)
+
+        # offload parameters to weights map
+        for param_name, param in named_module_tensors(
+            module, include_buffers=offload_buffers, recurse=place_submodules
+        ):
+            offloaded = param.to(offload_device)
+            hook.tied_params_map[offloaded.data_ptr()] = {}  # (1)
+            offload_to_weights_map(hook.weights_map, f"{name}.{param_name}", offloaded)
+
+            # if the parent places submodules, offload here
+            if hook.place_submodules:
+                set_module_tensor_to_device(module, param_name, current_device)
+
+        # if the parent does not place submodules, then add a hook
+        # parameters are offloaded by `add_hook_to_module`
+        if not hook.place_submodules:
+            weights_map = PrefixedDataset(
+                hook.weights_map.dataset, prefix=f"{hook.weights_map.prefix}{name}."
+            )
+
+            submodule_hook = AlignDevicesHook(
+                execution_device=hook.execution_device,
+                offload=hook.offload,
+                io_same_device=False,
+                weights_map=weights_map,
+                offload_buffers=offload_buffers,
+                place_submodules=place_submodules,
+                skip_keys=None,
+                tied_params_map=hook.tied_params_map,
+            )
+            add_hook_to_module(module, submodule_hook)
+
+    base.register_module(name, module)
+
+    # (1): Since we cannot know which pointers are shared when we add parameters in an
+    # online way, assume that all pointers are shared. This comes at no runtime cost.
+
+
+def delete_offload_module(base: torch.nn.Module, name: str):
+    """
+    Delete a submodule from a model which may contain offloading
+
+    :param base: parent module to delete the submodule from
+    :param name: name of the submodule on the parent
+    """
+    module: torch.nn.Module = getattr(base, name)
+
+    for param_name, _ in list(module.named_parameters()):
+        delete_offload_parameter(module, param_name)
+
+    delattr(base, name)
+
+
+@check_accelerate(fallback="error")
+def force_cpu_offload(
+    module: torch.nn.Module, execution_device: torch.device
+) -> torch.nn.Module:
+    """
+    Force cpu offloading of a module, primarily used for testing
+
+    :param module: module containing parameters to offload
+    :param execution_device: execution device for submodules
+    :return: module with hooks to perform cpu offloading
+    """
+    # edge case: there is a bug in `dispatch_model` which causes
+    # the function to only work if the model contains submodules
+    if next(module.children(), None) is None:
+        attach_align_device_hook(
+            module,
+            execution_device=execution_device,
+            offload=True,
+            weights_map=module.state_dict(),
+            tied_params_map={},
+        )
+        return module
+
+    device_map = {}
+
+    def collect_device_map(name: List[str], module: torch.nn.Module):
+        if next(module.parameters(recurse=False), None) is not None:
+            device_map[".".join(name)] = "cpu"
+            return
+
+        else:
+            for submodule_name, submodule in module.named_children():
+                name.append(submodule_name)
+                collect_device_map(name, submodule)
+                name.pop()
+
+    collect_device_map([], module)
+
+    return dispatch_model(
+        module, device_map, main_device=execution_device, force_hooks=True
+    )
+
+
 """ Upstreamed Functions """
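Putting the three new utilities together, here is a short usage sketch. It is illustrative only: `TinyModel` is a hypothetical stand-in, and it assumes `torch`, `accelerate`, and a build of `compressed-tensors` containing this commit are installed:

import torch
from compressed_tensors.utils import (
    delete_offload_module,
    force_cpu_offload,
    register_offload_module,
)

class TinyModel(torch.nn.Module):  # hypothetical toy model
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(1, 2)

    def forward(self, x):
        return self.linear(x)

exec_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# dispatch with cpu offloading; parameters live on "meta" until forward time
model = force_cpu_offload(TinyModel(), exec_device)

# attach a new child under the offloaded linear; its parameters are added
# to the parent's weights map and it receives its own AlignDevicesHook
register_offload_module(model.linear, "child", torch.nn.Linear(2, 3))
model(torch.empty(1))

# detach it again, removing its offloaded parameters as well
delete_offload_module(model.linear, "child")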

tests/test_utils/test_offload.py

Lines changed: 95 additions & 5 deletions
@@ -16,10 +16,13 @@
 from compressed_tensors.utils import (
     align_module_device,
     align_modules,
+    delete_offload_module,
     delete_offload_parameter,
     disable_hf_hook,
+    force_cpu_offload,
     get_execution_device,
     has_offloaded_params,
+    register_offload_module,
     register_offload_parameter,
     update_offload_parameter,
 )
@@ -37,9 +40,17 @@ def forward(self, x):
         return x * self.a + self.b


+class ExampleModel(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.linear = torch.nn.Linear(1, 2)
+
+    def forward(self, x):
+        return self.linear(x)
+
+
 @requires_accelerate()
 def test_has_offloaded_params():
-    from accelerate.big_modeling import cpu_offload_with_hook
     from accelerate.hooks import attach_align_device_hook, remove_hook_from_module

     module = ExampleModule()
@@ -48,10 +59,6 @@ def test_has_offloaded_params():
     attach_align_device_hook(module, offload=False)
     assert not has_offloaded_params(module)

-    remove_hook_from_module(module)
-    module, _ = cpu_offload_with_hook(module)
-    assert not has_offloaded_params(module)
-
     remove_hook_from_module(module)
     attach_align_device_hook(module, offload=True, weights_map=module.state_dict())
     assert has_offloaded_params(module)
@@ -334,3 +341,86 @@ def test_offload_to_weights_map():
     weights_map = PrefixedDataset(OffloadedWeightsLoader({name: old_value}), prefix)
     offload_to_weights_map(weights_map, name, new_value)
     assert weights_map[name] == new_value
+
+
+@requires_gpu
+@requires_accelerate()
+@pytest.mark.parametrize("exec_device", [torch.device("cpu"), torch.device("cuda")])
+def test_register_offload_module(exec_device):
+    # no offloading
+    model = ExampleModel()
+    child = torch.nn.Linear(2, 3)
+    register_offload_module(model, "child", child)
+    register_offload_module(model.linear, "child", child)
+    assert child in model.children()
+    assert child in model.linear.children()
+
+    # with offloading
+    model = ExampleModel()
+    child = torch.nn.Linear(2, 3)
+    force_cpu_offload(model, exec_device)
+    register_offload_module(model, "child", child)
+    register_offload_module(model.linear, "child", child)
+    assert child in model.children()
+    assert child in model.linear.children()
+
+    # can run modules
+    model(torch.empty(1))
+    child(torch.empty(2, device=exec_device))
+
+
+@requires_gpu
+@requires_accelerate()
+@pytest.mark.parametrize("exec_device", [torch.device("cpu"), torch.device("cuda")])
+def test_delete_offload_module(exec_device):
+    # no offloading
+    model = ExampleModel()
+    child = torch.nn.Linear(2, 3)
+    register_offload_module(model, "child", child)
+    register_offload_module(model.linear, "child", child)
+    delete_offload_module(model, "child")
+    delete_offload_module(model.linear, "child")
+    assert child not in model.children()
+    assert child not in model.linear.children()
+
+    # with offloading
+    model = ExampleModel()
+    child = torch.nn.Linear(2, 3)
+    force_cpu_offload(model, exec_device)
+    register_offload_module(model, "child", child)
+    register_offload_module(model.linear, "child", child)
+    delete_offload_module(model, "child")
+    delete_offload_module(model.linear, "child")
+    assert child not in model.children()
+    assert child not in model.linear.children()
+
+
+@requires_gpu
+@requires_accelerate()
+@pytest.mark.parametrize("exec_device", [torch.device("cpu"), torch.device("cuda")])
+def test_force_cpu_offload(exec_device):
+    # single module
+    module = torch.nn.Linear(1, 2)
+    module = force_cpu_offload(module, exec_device)
+    assert has_offloaded_params(module)
+    assert module._hf_hook.offload
+    assert module.weight.device == torch.device("meta")
+    assert "weight" in module._hf_hook.weights_map
+    assert module._hf_hook.tied_params_map is not None
+
+    # can run
+    module(torch.empty(1, device=exec_device))
+
+    # model
+    model = ExampleModel()
+    model = force_cpu_offload(model, exec_device)
+    assert not has_offloaded_params(model)
+
+    assert has_offloaded_params(model.linear)
+    assert model.linear._hf_hook.offload
+    assert model.linear.weight.device == torch.device("meta")
+    assert "weight" in model.linear._hf_hook.weights_map
+    assert model.linear._hf_hook.tied_params_map is not None
+
+    # can run
+    model(torch.empty(1, device=exec_device))
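One detail worth calling out from `test_force_cpu_offload`: for a model with submodules, `force_cpu_offload` builds a `device_map` by recursing until it finds modules that directly own parameters, mapping each one's dotted path to "cpu". For the `ExampleModel` fixture above this yields a single entry. A standalone sketch of that recursion (logic copied from the diff; the scaffolding around it is illustrative):

import torch

class ExampleModel(torch.nn.Module):  # mirrors the test fixture above
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(1, 2)

    def forward(self, x):
        return self.linear(x)

device_map = {}

def collect_device_map(name, module):
    # same recursion as force_cpu_offload: stop at the first module that
    # directly owns parameters and map its dotted path to "cpu"
    if next(module.parameters(recurse=False), None) is not None:
        device_map[".".join(name)] = "cpu"
        return
    for submodule_name, submodule in module.named_children():
        name.append(submodule_name)
        collect_device_map(name, submodule)
        name.pop()

collect_device_map([], ExampleModel())
print(device_map)  # {'linear': 'cpu'}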
