
Commit e09e716

make sure to sync current stream before overwriting with pinned params
not doing so will lead to erroneous computations on the GPU and cause bad results
1 parent 8c63bf5 commit e09e716

2 files changed: 3 additions (+), 3 deletions (−)


src/diffusers/hooks/group_offloading.py

Lines changed: 2 additions & 1 deletion
@@ -88,6 +88,7 @@ def onload_(self):
     def offload_(self):
         r"""Offloads the group of modules to the offload_device."""
         if self.stream is not None:
+            torch.cuda.current_stream().synchronize()
             for group_module in self.modules:
                 for param in group_module.parameters():
                     param.data = self.cpu_param_dict[param]

@@ -427,7 +428,7 @@ def _apply_group_offloading_leaf_level(
     cpu_param_dict = {param: param.data for param in module.parameters()}

     # Create module groups for leaf modules and apply group offloading hooks
-    for name, submodule in module.named_modules():
+    for submodule in module.modules():
         if not isinstance(submodule, _SUPPORTED_PYTORCH_LAYERS):
             continue
         group = ModuleGroup(
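
For context, here is a minimal sketch of why the added synchronization matters. It is not part of the commit, and names such as pinned_cpu_copy are illustrative: when offloading is driven by a separate stream, kernels already queued on the current stream may still be reading the GPU parameters, so param.data must not be repointed at the pinned CPU copy until that work has drained.

import torch

# Illustrative sketch, assuming a CUDA device is available; variable names are hypothetical.
device = torch.device("cuda")
param = torch.nn.Parameter(torch.randn(1024, 1024, device=device))
pinned_cpu_copy = param.data.cpu().pin_memory()  # analogous to cpu_param_dict[param]

# Asynchronous work queued on the current stream that still reads `param`.
out = param @ param

# Without this synchronization, the assignment below could race with the matmul
# above, and the GPU could read memory that has already been repurposed.
torch.cuda.current_stream().synchronize()

# Safe now: all queued work that uses the GPU copy of the parameter has completed.
param.data = pinned_cpu_copy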

src/diffusers/hooks/hooks.py

Lines changed: 1 addition & 2 deletions
@@ -151,8 +151,7 @@ def new_forward(module, *args, **kwargs):
            # return hook.post_forward(module, output)

        new_forward = create_new_forward(fn_ref)
-       new_forward = functools.update_wrapper(functools.partial(new_forward, self._module_ref), forward)
-       self._module_ref.forward = new_forward
+       self._module_ref.forward = functools.update_wrapper(functools.partial(new_forward, self._module_ref), forward)

        self.hooks[name] = hook
        self._hook_order.append(name)
