Commit 955f2f5

Merge
1 parent e7f08e1 commit 955f2f5

File tree

5 files changed: +278 -82 lines changed

.github/workflows/report.yml

Lines changed: 1 addition & 1 deletion
@@ -120,7 +120,7 @@ jobs:
       shell: bash

     - name: report to reportportal
-      uses: neuralmagic/nm-actions/actions/reportportal_submit_execution_results@v1.15.0
+      uses: neuralmagic/nm-actions/actions/reportportal_submit_execution_results@v1.22.0
       with:
         droute_username: ${{ secrets.DROUTE_USERNAME }}
         droute_password: ${{ secrets.DROUTE_PASSWORD }}

src/compressed_tensors/utils/offload.py

Lines changed: 147 additions & 65 deletions
@@ -14,27 +14,30 @@
 """
 Utilities associated with offloading functionality provided by `accelerate`.

-| ----------------------------------------------------------------------------------------------------- | # noqa: E501
-| Operation | Without offloading support             | With offloading support                          | # noqa: E501
-| --------- | -------------------------------------- | ------------------------------------------------ | # noqa: E501
-| Add       | module.register_parameter(name, param) | register_offload_parameter(module, name, param)  | # noqa: E501
-| Check     | N/A                                    | has_offloaded_params(module)                     | # noqa: E501
-| Onload    | N/A                                    | with align_module_device(module)                 | # noqa: E501
-| Update    | module.name.data.copy_(new_data)       | update_offload_parameter(module, name, new_data) | # noqa: E501
-| Delete    | del module.name                        | delete_offload_parameter(module, name)           | # noqa: E501
-| ----------------------------------------------------------------------------------------------------- | # noqa: E501
+| ------------------------------------------------------------------------------------------------------ | # noqa: E501
+| Operation  | Without offloading support             | With offloading support                          | # noqa: E501
+| ---------- | -------------------------------------- | ------------------------------------------------ | # noqa: E501
+| Add        | module.register_parameter(name, param) | register_offload_parameter(module, name, param)  | # noqa: E501
+| Check      | N/A                                    | has_offloaded_params(module)                     | # noqa: E501
+| Onload     | N/A                                    | with align_module_device(module)                 | # noqa: E501
+| Update     | module.name.data.copy_(new_data)       | update_offload_parameter(module, name, new_data) | # noqa: E501
+| Delete     | del module.name                        | delete_offload_parameter(module, name)           | # noqa: E501
+| Add Module | module.register_module(name, child)    | register_offload_module(name, child)             | # noqa: E501
+| Del Module | del module.name                        | delete_offload_module(module, name)              | # noqa: E501
+| ------------------------------------------------------------------------------------------------------ | # noqa: E501
 """

 import contextlib
 import warnings
 from functools import wraps
-from typing import Any, Callable, Dict, Iterable, List, Literal, Optional, Union
+from operator import attrgetter
+from typing import Any, Callable, Dict, Iterable, Literal, Optional, Tuple, Union

 import torch
+from compressed_tensors.utils import patch_attr


 try:
-    from accelerate import dispatch_model
     from accelerate.hooks import (
         AlignDevicesHook,
         add_hook_to_module,
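For orientation, the table above corresponds to a workflow like the following minimal sketch (assumes accelerate is installed; the `Linear` module and `scale` parameter are illustrative only, not part of this commit):

    import torch
    from compressed_tensors.utils import (
        align_module_device,
        delete_offload_parameter,
        has_offloaded_params,
        register_offload_parameter,
        update_offload_parameter,
    )

    linear = torch.nn.Linear(4, 4)

    # Add: register a new parameter, keeping any offloaded copy in sync
    register_offload_parameter(linear, "scale", torch.nn.Parameter(torch.ones(4)))

    # Check + Onload: detect offloading and temporarily onload for compute
    if has_offloaded_params(linear):
        with align_module_device(linear):
            _ = linear(torch.rand(1, 4))

    # Update + Delete: write through to the offloaded copy, then remove
    update_offload_parameter(linear, "scale", torch.full((4,), 2.0))
    delete_offload_parameter(linear, "scale")
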
@@ -45,10 +48,12 @@
     from accelerate.utils import (
         OffloadedWeightsLoader,
         PrefixedDataset,
+        find_tied_parameters,
         set_module_tensor_to_device,
     )

     _has_accelerate = True
+
 except ImportError:
     _has_accelerate = False
     AlignDevicesHook = None
@@ -58,8 +63,8 @@
     PrefixedDataset = None
     set_module_tensor_to_device = None
     named_module_tensors = None
-    dispatch_model = None
     attach_align_device_hook = None
+    find_tied_parameters = None


 __all__ = [
@@ -78,14 +83,15 @@
     "align_module_device",
     "register_offload_module",
     "delete_offload_module",
-    "force_cpu_offload",
+    "offloaded_dispatch",
+    "disable_offloading",
+    "remove_dispatch",
 ]


 def check_accelerate(fallback: Any):
     def decorator(func: Callable[[Any], Any]):
         if not _has_accelerate:
-
             if fallback == "error":

                 @wraps(func)
@@ -165,22 +171,22 @@ def update_parameter_data(

 def get_execution_device(module: torch.nn.Module) -> torch.device:
     """
-    Get the device which inputs should be moved to before module execution
+    Get the device which inputs should be moved to before module execution.
+    Assume that modules execute in the same order as returned by `model.modules()`

     :param module: module to check, may be offloaded
     :return: onload device of module
     """
-    if has_offloaded_params(module):
-        return module._hf_hook.execution_device
+    for submodule in module.modules():
+        if has_offloaded_params(submodule):
+            return submodule._hf_hook.execution_device

-    first_param = next(module.parameters(), None)
-    if first_param is None:
-        warnings.warn(
-            f"Unable able to infer execution device of {module}, falling back to CPU"
-        )
-        return torch.device("cpu")
+        param = next(submodule.parameters(recurse=False), None)
+        if param is not None:
+            return param.device

-    return first_param.device
+    warnings.warn(f"Unable to get execution device of {module}, falling back to CPU")
+    return torch.device("cpu")


 def register_offload_parameter(
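The effect of this rewrite, sketched below: the new implementation walks submodules in `model.modules()` order, so a dispatched model whose parameters all live on the meta device still reports the hook's execution device. The sketch assumes accelerate, a CUDA device, and that these helpers are re-exported from `compressed_tensors.utils` (as the tests in this commit do); the two-layer model is illustrative.

    import torch
    from compressed_tensors.utils import get_execution_device, offloaded_dispatch

    model = torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.Linear(4, 4))

    # no hooks anywhere: falls through to the first materialized parameter
    assert get_execution_device(model) == torch.device("cpu")

    # after dispatch, parameters are meta tensors, but the first offloaded
    # submodule's AlignDevicesHook carries the execution device
    offloaded_dispatch(model, execution_device=torch.device("cuda:0"))
    assert get_execution_device(model) == torch.device("cuda:0")
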
@@ -201,17 +207,32 @@ def register_offload_parameter(
     has_onload = any(p.device != torch.device("meta") for p in module.parameters())
     module.register_parameter(name, parameter)

+    # do everything AlignDevicesHook.init_hook does
+    # https://github.com/huggingface/accelerate/blob/main/src/accelerate/hooks.py#L281
     if has_offloaded_params(module):
-        weights_map = module._hf_hook.weights_map
-        offload_to_weights_map(weights_map, name, parameter.data, offload_device)
+        hook: AlignDevicesHook = module._hf_hook
+        assert hook.weights_map is not None
+
+        # append to original_devices
+        hook.original_devices[name] = parameter.device
+
+        # append to weights map
+        offload_to_weights_map(hook.weights_map, name, parameter.data, offload_device)
+
+        # append to tied_params_map
+        offloaded = hook.weights_map[name]
+        if hook.tied_params_map is not None:
+            hook.tied_params_map[offloaded.data_ptr()] = {}  # (1)
+
+        # perform offloading
         if not has_onload:
             set_module_tensor_to_device(module, name, "meta")


 def update_offload_parameter(
     module: torch.nn.Module,
     name: str,
-    data: Optional[torch.Tensor],
+    data: torch.Tensor,
     offload_device: Optional[Union[torch.device, Literal["disk"]]] = None,
 ):
     """
@@ -224,15 +245,15 @@ def update_offload_parameter(
     :param offload_device: device on which weight will be offloaded to. If None is
         provided, then infer device from parameters on module
     """
-    param = getattr(module, name)
+    param: torch.nn.Parameter = getattr(module, name)
     if param.data.shape != data.shape:
         warnings.warn(
             f"Shape of parameter being updated {param.data.shape} does not match shape "
             f"of update data {data.shape}"
         )

     # copy data into onloaded parameter if applicable
-    if param.device != torch.device("meta"):
+    if param.device != torch.device("meta") and data is not param.data:
         param.data.copy_(data)

     # update offload dict
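The narrowed annotation (`data: torch.Tensor`) matches how the function is actually called, and the new `data is not param.data` guard makes it safe to pass a parameter's own tensor, which the teardown of `disable_offloading` below relies on. A small sketch, runnable without any dispatch:

    import torch
    from compressed_tensors.utils import update_offload_parameter

    module = torch.nn.Linear(4, 4)

    # passing the parameter's own tensor no longer triggers a redundant
    # self-copy; only the offload dict (when present) would be refreshed
    update_offload_parameter(module, "weight", module.weight.data)

    # fresh data is still copied into the onloaded parameter
    update_offload_parameter(module, "weight", torch.zeros(4, 4))
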
@@ -417,7 +438,6 @@ def register_offload_module(base: torch.nn.Module, name: str, module: torch.nn.M
         hook: AlignDevicesHook = base._hf_hook
         assert hook.offload
         assert hook.weights_map is not None
-        assert hook.tied_params_map is not None

         # offloading kwargs for submodule
         place_submodules = False
@@ -432,7 +452,8 @@ def register_offload_module(base: torch.nn.Module, name: str, module: torch.nn.M
             module, include_buffers=offload_buffers, recurse=place_submodules
         ):
             offloaded = param.to(offload_device)
-            hook.tied_params_map[offloaded.data_ptr()] = {}  # (1)
+            if hook.tied_params_map is not None:
+                hook.tied_params_map[offloaded.data_ptr()] = {}  # (1)
             offload_to_weights_map(hook.weights_map, f"{name}.{param_name}", offloaded)

         # if the parent places submodules, offload here
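For context, the caller-facing behavior being adjusted here: `register_offload_module` folds a new child's parameters into the parent's weights map and, when a tied-params map exists, conservatively marks each offloaded pointer as potentially tied, per note (1) at the end of the file. An illustrative sketch (assumes accelerate and a CUDA device; `Block` and `head` are made up for the example):

    import torch
    from compressed_tensors.utils import offloaded_dispatch, register_offload_module

    class Block(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.proj = torch.nn.Parameter(torch.rand(4, 4))

    parent = Block()
    offloaded_dispatch(parent, execution_device=torch.device("cuda:0"))

    # the child's weights are offloaded under the "head" prefix and managed
    # by the same AlignDevicesHook machinery as the parent's parameters
    register_offload_module(parent, "head", torch.nn.Linear(4, 2))
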
@@ -460,9 +481,6 @@ def register_offload_module(base: torch.nn.Module, name: str, module: torch.nn.M

     base.register_module(name, module)

-    # (1): Since we cannot know which pointers are shared when we add parameters in an
-    # online way, assume that all pointers are shared. This comes at no runtime cost
-

 def delete_offload_module(base: torch.nn.Module, name: str):
     """
@@ -479,46 +497,106 @@ def delete_offload_module(base: torch.nn.Module, name: str):


 @check_accelerate(fallback="error")
-def force_cpu_offload(
-    module: torch.nn.Module, execution_device: torch.device
+def offloaded_dispatch(
+    module: torch.nn.Module,
+    execution_device: torch.device,
+    offload_device: Union[torch.device, Literal["disk"]] = torch.device("cpu"),
 ) -> torch.nn.Module:
     """
-    Force cpu offloading a module, primarily used for testing
+    Unlike `dispatch_model`, this function forces a module (and its submodules) to
+    offload all parameters and replace them with meta tensors, utilizing the
+    `AlignDevicesHook` to control onloading and offloading.

     :param module: module containing parameters to offload
-    :param execution_device: execution device submodules
-    :return: module with hooks to perform cpu offloading
-    """
-    # edge case: there is a bug in `dispatch_model` which causes
-    # the function to only work if the model contains submodules
-    if next(module.children(), None) is None:
-        attach_align_device_hook(
-            module,
-            execution_device=execution_device,
-            offload=True,
-            weights_map=module.state_dict(),
-            tied_params_map={},
-        )
-        return module
+    :param execution_device: device that modules will onload and execute on
+    :param offload_device: device that module parameters will offload to
+    :return: module with offloading device hooks
+    """
+    if offload_device == "disk":
+        raise NotImplementedError("Disk offloading is not currently supported")
+
+    # remove any existing hooks
+    remove_dispatch(module)
+
+    # create weights map
+    state_dict = module.state_dict()
+    state_dict = {key: val.to(offload_device) for key, val in state_dict.items()}
+    weights_map = OffloadedWeightsLoader(state_dict=state_dict, device=offload_device)
+
+    # create tied params map
+    tied_params = find_tied_parameters(module)
+    tied_params_map = {}
+    for group in tied_params:
+        for param_name in group:
+            data_ptr = attrgetter(param_name)(module).data_ptr()
+            tied_params_map[data_ptr] = {}
+
+    # recursively attaches hooks to all submodules
+    attach_align_device_hook(
+        module,
+        execution_device=execution_device,
+        offload=True,
+        weights_map=weights_map,
+        tied_params_map=tied_params_map,
+    )

-    device_map = {}
+    # when saving a model, `PretrainedModel.save_pretrained` will only
+    # onload weights if the following requirements are met
+    # if (
+    #     hasattr(self, "hf_device_map")
+    #     and len(set(self.hf_device_map.values())) > 1
+    #     and ("cpu" in self.hf_device_map.values()
+    #          or "disk" in self.hf_device_map.values())
+    # ):
+    # because this function always offloads, disregard actual devices and
+    # always use `cpu` and `cuda:0` to guarantee this condition passes
+    setattr(module, "hf_device_map", {"fake_offload": "cpu", "fake_exec": "cuda:0"})

-    def collect_device_map(name: List[str], module: torch.nn.Module):
-        if next(module.parameters(recurse=False), None) is not None:
-            device_map[".".join(name)] = "cpu"
-            return
+    return module

-        else:
-            for submodule_name, submodule in module.named_children():
-                name.append(submodule_name)
-                collect_device_map(name, submodule)
-                name.pop()

-    collect_device_map([], module)
+def remove_dispatch(module: torch.nn.Module) -> torch.nn.Module:
+    """
+    Remove any existing dispatches from module
+
+    :param module: module which may be dispatched with hf hooks
+    :return: module without dispatch
+    """
+    remove_hook_from_module(module, recurse=True)
+    if hasattr(module, "hf_device_map"):
+        delattr(module, "hf_device_map")
+
+    return module

-    return dispatch_model(
-        module, device_map, main_device=execution_device, force_hooks=True
-    )
+
+@contextlib.contextmanager
+def disable_offloading():
+    """
+    Keep modules onloaded and disable offloading until this context exits.
+    Affects modules which have been hooked with accelerate's `AlignDevicesHook`
+    """
+    original_pre_forward = AlignDevicesHook.pre_forward
+    onloaded_modules: Dict[torch.nn.Module, Tuple[AlignDevicesHook, bool]] = dict()
+
+    # onload once and disable any future onloading/offloading steps
+    def keep_onload_pre_forward(self: AlignDevicesHook, module, *args, **kwargs):
+        ret = original_pre_forward(self, module, *args, **kwargs)
+        if module not in onloaded_modules:
+            onloaded_modules[module] = (self, self.offload)
+            self.offload = False
+        return ret
+
+    # use the patched pre_forward function within the context
+    with patch_attr(AlignDevicesHook, "pre_forward", keep_onload_pre_forward):
+        yield
+
+    # manually offload all modules that were onloaded
+    # update any parameters which may have changed
+    for module, (hook, offload) in onloaded_modules.items():
+        hook.offload = offload
+        for name, param in module.named_parameters(recurse=False):
+            update_offload_parameter(module, name, param.data)
+        hook.post_forward(module, None)


 """ Upstreamed Functions """
@@ -588,3 +666,7 @@ def align_module_device(

     else:
         yield
+
+
+# (1): Since we cannot know which pointers are shared when we add parameters in an
+# online way, assume that all pointers are shared. This has virtually no runtime cost
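The `# (1)` markers earlier in the diff point at this relocated note. The tied-pointer bookkeeping it describes can be observed with accelerate's `find_tied_parameters`, which `offloaded_dispatch` uses to seed `tied_params_map` (toy weight tying, illustrative only):

    import torch
    from accelerate.utils import find_tied_parameters

    model = torch.nn.ModuleDict(
        {"embed": torch.nn.Linear(4, 4), "head": torch.nn.Linear(4, 4)}
    )
    model["head"].weight = model["embed"].weight  # tie the two weights

    # groups of names sharing storage, e.g. [['embed.weight', 'head.weight']];
    # offloaded_dispatch records one tied_params_map entry per shared data_ptr
    print(find_tied_parameters(model))
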

tests/test_transform/factory/test_correctness.py

Lines changed: 2 additions & 2 deletions
@@ -21,7 +21,7 @@
     TransformScheme,
     apply_transform_config,
 )
-from compressed_tensors.utils import force_cpu_offload
+from compressed_tensors.utils import offloaded_dispatch
 from tests.testing_utils import requires_accelerate, requires_gpu

@@ -65,7 +65,7 @@ def test_correctness_model(scheme_kwargs, model_apply, offload=False):
     # load model
     model = model_apply[0]
     if offload:
-        model = force_cpu_offload(model, torch.device("cuda"))
+        model = offloaded_dispatch(model, torch.device("cuda"))

     # get output
     input = torch.rand((17, model.fcs[0].in_features))

tests/test_transform/factory/test_memory.py

Lines changed: 2 additions & 2 deletions
@@ -24,7 +24,7 @@
     TransformScheme,
     apply_transform_config,
 )
-from compressed_tensors.utils import align_modules, force_cpu_offload
+from compressed_tensors.utils import align_modules, offloaded_dispatch
 from tests.test_transform.conftest import TransformableModel
 from tests.testing_utils import requires_accelerate, requires_gpu

@@ -41,7 +41,7 @@ def test_memory_sharing(scheme_kwargs, offload=False):
     # load model (maybe with offloading)
     model = TransformableModel(2, 2, 4, 4, 8, 8)
     if offload:
-        force_cpu_offload(model, torch.device("cuda"))
+        offloaded_dispatch(model, torch.device("cuda"))

     # add transforms to model
     config = TransformConfig(
