neuralmagic
diff --git a/‎.github/actions/test/action.yml
Lines changed: 17 additions & 0 deletions b/‎.github/actions/test/action.yml
Lines changed: 17 additions & 0 deletions
diff --git a/‎.github/workflows/report.yml
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/report.yml
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/test.yml
Lines changed: 29 additions & 17 deletions b/‎.github/workflows/test.yml
Lines changed: 29 additions & 17 deletions
diff --git a/‎.github/workflows/trigger-all.yml
Lines changed: 2 additions & 2 deletions b/‎.github/workflows/trigger-all.yml
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/compressed_tensors/utils/offload.py
Lines changed: 103 additions & 22 deletions b/‎src/compressed_tensors/utils/offload.py
Lines changed: 103 additions & 22 deletions
@@ -22,6 +22,23 @@ runs:
           name: compressed
           extra: "[dev,accelerate]"
 
+    # Free disk space on the runner before the test step below: delete *.whl
+    # files under the working directory, purge the pip cache, remove a fixed
+    # list of directories under /usr and /opt, run `apt-get clean` when
+    # /etc/issue mentions Ubuntu, then print the remaining space with `df -h`.
+    - name: clean up
+      run: |
+          echo "cleaning up disk space..."
+          find . -type f -name '*.whl' -exec rm -rf {} \;
+          python -m pip cache purge
+          sudo rm -rf /usr/local/.ghcup
+          sudo rm -rf /opt/hostedtoolcache/CodeQL
+          sudo rm -rf /usr/local/lib/android/sdk/ndk
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /opt/ghc
+          sudo rm -rf /usr/local/share/boost
+          if [[ "$(cat /etc/issue)" =~ Ubuntu ]]; then
+              sudo apt-get clean
+          fi
+          df -h
+      shell: bash
+
     - name: test
       id: test
       run: |
 
@@ -120,7 +120,7 @@ jobs:
               shell: bash
 
             - name: report to reportportal
-              uses: neuralmagic/nm-actions/actions/reportportal_submit_execution_results@v1.15.0
+              uses: neuralmagic/nm-actions/actions/reportportal_submit_execution_results@v1.22.0
               with:
                 droute_username: ${{ secrets.DROUTE_USERNAME }}
                 droute_password: ${{ secrets.DROUTE_PASSWORD }}
 
@@ -22,7 +22,9 @@ on:
       whl:
         description: "whl to test (variable appears late binding so unusable outside 'download artifact')"
         type: string
-        required: true
+      run_id:
+        description: run id of the BUILD job that generated the assets
+        type: string
 
   # makes workflow manually callable
   workflow_dispatch:
@@ -44,9 +46,11 @@ on:
         type: string
         required: true
       whl:
-        description: "whl to test (variable appears late binding so unusable outside 'download artifact')"
+        description: "whl to test (provide either whl or run_id)"
+        type: string
+      run_id:
+        description: run id of the BUILD job that generated the assets
         type: string
-        required: true
 
 jobs:
 
@@ -87,11 +91,33 @@ jobs:
 
             - name: download whl
               id: download
+              if: ${{ inputs.whl != '' }}
               uses: actions/download-artifact@v4
               with:
                   name: ${{ inputs.whl }}
                   path: ${{ inputs.whl }}
 
+            # GCP
+            - name: 'Authenticate to Google Cloud'
+              id: auth
+              uses: google-github-actions/auth@v2.1.3
+              with:
+                  project_id: ${{ secrets.GCP_PROJECT }}
+                  workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }}
+                  service_account: ${{ secrets.GCP_GHA_SA }}
+
+            - name: 'Set up Cloud SDK'
+              uses: 'google-github-actions/setup-gcloud@v2'
+              with:
+                  version: '>= 473.0.0'
+
+            - name: download assets
+              if: ${{ inputs.run_id != '' }}
+              uses: neuralmagic/nm-actions/actions/gcp-download-assets@v1.1.0
+              with:
+                  bucket_source: ${{ secrets.GCP_BUILD_ML_ASSETS2 }}
+                  run_id: ${{ inputs.run_id }}
+
             - name: run tests
               id: test
               uses: ./.github/actions/test/
@@ -109,20 +135,6 @@ jobs:
                   whl: ${{ inputs.whl }}
                   test_status: ${{ steps.test.outputs.status }}
 
-            # GCP
-            - name: 'Authenticate to Google Cloud'
-              id: auth
-              uses: google-github-actions/auth@v2.1.3
-              with:
-                  project_id: ${{ secrets.GCP_PROJECT }}
-                  workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }}
-                  service_account: ${{ secrets.GCP_GHA_SA }}
-
-            - name: 'Set up Cloud SDK'
-              uses: 'google-github-actions/setup-gcloud@v2'
-              with:
-                  version: '>= 473.0.0'
-
             - name: copy results to GCP
               run: |
                   gcloud storage cp test-results/report.xml ${{ secrets.GCP_BUILD_ML_ASSETS2 }}/${{ github.run_id }}/test-results/report-${{ inputs.test_label }}.xml
 
@@ -32,8 +32,8 @@ jobs:
             wf_category: ${{ inputs.wf_category || 'NIGHTLY' }}
             gitref: ${{ inputs.gitref || 'main' }}
             push_to_pypi: ${{ (github.event.schedule == '30 0 * * *') || inputs.push_to_pypi || false }}
-            test_configs: '[{"python":"3.11.4","label":"ubuntu-22.04","timeout":"40"},
-                            {"python":"3.10.12","label":"ubuntu-24.04","timeout":"40"},
+            test_configs: '[{"python":"3.11.4","label":"ubuntu-24.04","timeout":"40"},
+                            {"python":"3.10.12","label":"ubuntu-22.04","timeout":"40"},
                             {"python":"3.9.17","label":"k8s-h100-solo","timeout":"40"},
                             {"python":"3.12.6","label":"k8s-a100-duo","timeout":"40"}]'
 
 
@@ -31,9 +31,10 @@
 import warnings
 from functools import wraps
 from operator import attrgetter
-from typing import Any, Callable, Dict, Iterable, Literal, Optional, Union
+from typing import Any, Callable, Dict, Iterable, Literal, Optional, Tuple, Union
 
 import torch
+from compressed_tensors.utils import patch_attr
 
 
 try:
@@ -83,6 +84,8 @@
     "register_offload_module",
     "delete_offload_module",
     "offloaded_dispatch",
+    "disable_offloading",
+    "remove_dispatch",
 ]
 
 
@@ -168,22 +171,22 @@ def update_parameter_data(
 
 def get_execution_device(module: torch.nn.Module) -> torch.device:
     """
-    Get the device which inputs should be moved to before module execution
+    Get the device which inputs should be moved to before module execution.
+    Assume that modules execute in the same order as returned by `module.modules()`,
+    so the first submodule that is offloaded or directly owns a parameter decides
+    the result. Falls back to CPU (with a warning) if no such submodule exists
 
     :param module: module to check, may be offloaded
     :return: onload device of module
     """
-    if has_offloaded_params(module):
-        return module._hf_hook.execution_device
+    for submodule in module.modules():
+        # offloaded submodules report the execution device stored on their hook
+        if has_offloaded_params(submodule):
+            return submodule._hf_hook.execution_device
 
-    first_param = next(module.parameters(), None)
-    if first_param is None:
-        warnings.warn(
-            f"Unable able to infer execution device of {module}, falling back to CPU"
-        )
-        return torch.device("cpu")
+        # otherwise use the device of a parameter owned directly by this submodule
+        param = next(submodule.parameters(recurse=False), None)
+        if param is not None:
+            return param.device
 
-    return first_param.device
+    # no submodule was offloaded or owned a parameter
+    warnings.warn(f"Unable to get execution device of {module}, falling back to CPU")
+    return torch.device("cpu")
 
 
 def register_offload_parameter(
@@ -204,17 +207,32 @@ def register_offload_parameter(
     has_onload = any(p.device != torch.device("meta") for p in module.parameters())
     module.register_parameter(name, parameter)
 
+    # do everything AlignDevicesHook.init_hook does
+    # https://github.com/huggingface/accelerate/blob/main/src/accelerate/hooks.py#L281
     if has_offloaded_params(module):
-        weights_map = module._hf_hook.weights_map
-        offload_to_weights_map(weights_map, name, parameter.data, offload_device)
+        hook: AlignDevicesHook = module._hf_hook
+        assert hook.weights_map is not None
+
+        # append to original_devices
+        hook.original_devices[name] = parameter.device
+
+        # append to weights map
+        offload_to_weights_map(hook.weights_map, name, parameter.data, offload_device)
+
+        # append to tied_params_map
+        offloaded = hook.weights_map[name]
+        if hook.tied_params_map is not None:
+            hook.tied_params_map[offloaded.data_ptr()] = {}  # (1)
+
+        # perform offloading
         if not has_onload:
             set_module_tensor_to_device(module, name, "meta")
 
 
 def update_offload_parameter(
     module: torch.nn.Module,
     name: str,
-    data: Optional[torch.Tensor],
+    data: torch.Tensor,
     offload_device: Optional[Union[torch.device, Literal["disk"]]] = None,
 ):
     """
@@ -227,15 +245,15 @@ def update_offload_parameter(
     :param offload_device: device on which weight will be offloaded to. If None is
         provided, then infer device from parameters on module
     """
-    param = getattr(module, name)
+    param: torch.nn.Parameter = getattr(module, name)
     if param.data.shape != data.shape:
         warnings.warn(
             f"Shape of parameter being updated {param.data.shape} does not match shape "
             f"of update data {data.shape}"
         )
 
     # copy data into onloaded parameter if applicable
-    if param.device != torch.device("meta"):
+    if param.device != torch.device("meta") and data is not param.data:
         param.data.copy_(data)
 
     # update offload dict
@@ -420,7 +438,6 @@ def register_offload_module(base: torch.nn.Module, name: str, module: torch.nn.M
         hook: AlignDevicesHook = base._hf_hook
         assert hook.offload
         assert hook.weights_map is not None
-        assert hook.tied_params_map is not None
 
         # offloading kwargs for submodule
         place_submodules = False
@@ -435,7 +452,8 @@ def register_offload_module(base: torch.nn.Module, name: str, module: torch.nn.M
             module, include_buffers=offload_buffers, recurse=place_submodules
         ):
             offloaded = param.to(offload_device)
-            hook.tied_params_map[offloaded.data_ptr()] = {}  # (1)
+            if hook.tied_params_map is not None:
+                hook.tied_params_map[offloaded.data_ptr()] = {}  # (1)
             offload_to_weights_map(hook.weights_map, f"{name}.{param_name}", offloaded)
 
             # if the parent places submodules, offload here
@@ -463,9 +481,6 @@ def register_offload_module(base: torch.nn.Module, name: str, module: torch.nn.M
 
     base.register_module(name, module)
 
-    # (1): Since we cannot know which pointers are shared when we add parameters in an
-    # online way, assume that all pointers are shared. This comes at no runtime cost
-
 
 def delete_offload_module(base: torch.nn.Module, name: str):
     """
@@ -500,8 +515,13 @@ def offloaded_dispatch(
     if offload_device == "disk":
         raise NotImplementedError("Disk offloading is not currently supported")
 
+    # remove any existing hooks
+    remove_dispatch(module)
+
     # create weights map
-    weights_map = OffloadedWeightsLoader(state_dict=module.state_dict(), device="cpu")
+    state_dict = module.state_dict()
+    state_dict = {key: val.to(offload_device) for key, val in state_dict.items()}
+    weights_map = OffloadedWeightsLoader(state_dict=state_dict, device=offload_device)
 
     # create tied params map
     tied_params = find_tied_parameters(module)
@@ -519,9 +539,66 @@ def offloaded_dispatch(
         weights_map=weights_map,
         tied_params_map=tied_params_map,
     )
+
+    # when saving a model, `PreTrainedModel.save_pretrained` will only
+    # onload weights if the following requirements are met
+    # if (
+    #     hasattr(self, "hf_device_map")
+    #     and len(set(self.hf_device_map.values())) > 1
+    #     and ("cpu" in self.hf_device_map.values()
+    #          or "disk" in self.hf_device_map.values())
+    # ):
+    # because this function always offloads, disregard actual devices and
+    # always use `cpu` and `cuda:0` to guarantee this condition passes
+    setattr(module, "hf_device_map", {"fake_offload": "cpu", "fake_exec": "cuda:0"})
+
     return module
 
 
+def remove_dispatch(module: torch.nn.Module) -> torch.nn.Module:
+    """
+    Strip dispatch state from a module
+
+    Calls `remove_hook_from_module` recursively, then drops the `hf_device_map`
+    attribute if the module has one
+
+    :param module: module which may be dispatched with hf hooks
+    :return: the same module instance, without dispatch
+    """
+    remove_hook_from_module(module, recurse=True)
+
+    has_device_map = hasattr(module, "hf_device_map")
+    if has_device_map:
+        del module.hf_device_map
+
+    return module
+
+
+@contextlib.contextmanager
+def disable_offloading():
+    """
+    Keep modules onloaded and disable offloading until this context exits.
+    Affects modules which have been hooked with accelerate's `AlignDevicesHook`.
+
+    While the context is active, `AlignDevicesHook.pre_forward` is patched so that
+    the first call for each module records its hook and switches the hook's
+    `offload` flag off. On exit, including when the body raises, every recorded
+    hook has its `offload` flag restored, the module's current parameter data is
+    written back with `update_offload_parameter`, and `hook.post_forward` is called
+    """
+    original_pre_forward = AlignDevicesHook.pre_forward
+    onloaded_modules: Dict[torch.nn.Module, Tuple[AlignDevicesHook, bool]] = dict()
+
+    # onload once and disable any future onloading/offloading steps
+    def keep_onload_pre_forward(self: AlignDevicesHook, module, *args, **kwargs):
+        ret = original_pre_forward(self, module, *args, **kwargs)
+        if module not in onloaded_modules:
+            # remember the hook and its original flag so both can be restored
+            onloaded_modules[module] = (self, self.offload)
+            self.offload = False
+        return ret
+
+    try:
+        # use the patched pre_forward function within the context
+        with patch_attr(AlignDevicesHook, "pre_forward", keep_onload_pre_forward):
+            yield
+    finally:
+        # run the cleanup even if the body raised, otherwise the recorded hooks
+        # would be left with `offload = False` and their modules kept onloaded
+        #
+        # manually offload all modules that were onloaded
+        # update any parameters which may have changed
+        for module, (hook, offload) in onloaded_modules.items():
+            hook.offload = offload
+            for name, param in module.named_parameters(recurse=False):
+                update_offload_parameter(module, name, param.data)
+            hook.post_forward(module, None)
+
+
 """ Upstreamed Functions """
 
 
@@ -589,3 +666,7 @@ def align_module_device(
 
     else:
         yield
+
+
+# (1): Since we cannot know which pointers are shared when we add parameters in an
+# online way, assume that all pointers are shared. This has virtually no runtime cost