fix sequential offload for wan (#73)

akaitsuki-ii · web-flow · commit de3af02df09c · 2025-05-27T16:53:00.000+08:00
* fix sequential offload

* fix

* fix
diff --git a/diffsynth_engine/models/wan/wan_image_encoder.py b/diffsynth_engine/models/wan/wan_image_encoder.py
@@ -349,30 +349,6 @@ def __init__(
             embedding_dropout=embedding_dropout,
             norm_eps=norm_eps,
         )
-        self.textual = None
-        self.log_scale = nn.Parameter(math.log(1 / 0.07) * torch.ones([]))
-
-    def forward(self, imgs, txt_ids):
-        """
-        imgs:       [B, 3, H, W] of torch.float32.
-        - mean:     [0.48145466, 0.4578275, 0.40821073]
-        - std:      [0.26862954, 0.26130258, 0.27577711]
-        txt_ids:    [B, L] of torch.long.
-                    Encoded by data.CLIPTokenizer.
-        """
-        xi = self.visual(imgs)
-        xt = self.textual(txt_ids)
-        return xi, xt
-
-    def param_groups(self):
-        groups = [
-            {
-                "params": [p for n, p in self.named_parameters() if "norm" in n or n.endswith("bias")],
-                "weight_decay": 0.0,
-            },
-            {"params": [p for n, p in self.named_parameters() if not ("norm" in n or n.endswith("bias"))]},
-        ]
-        return groups
 
 
 def _clip(
@@ -444,7 +420,7 @@ def _from_diffusers(self, state_dict):
     def _from_civitai(self, state_dict):
         state_dict_ = {}
         for name, param in state_dict.items():
-            if name.startswith("textual."):
+            if name.startswith(("textual.", "log_scale")):
                 continue
             name = "model." + name
             state_dict_[name] = param
diff --git a/diffsynth_engine/models/wan/wan_text_encoder.py b/diffsynth_engine/models/wan/wan_text_encoder.py
@@ -147,8 +147,6 @@ def __init__(self, num_buckets, num_heads, bidirectional, max_dist=128):
 
     def forward(self, lq, lk):
         device = self.embedding.weight.device
-        # rel_pos = torch.arange(lk).unsqueeze(0).to(device) - \
-        #     torch.arange(lq).unsqueeze(1).to(device)
         rel_pos = torch.arange(lk, device=device).unsqueeze(0) - torch.arange(lq, device=device).unsqueeze(1)
         rel_pos = self._relative_position_bucket(rel_pos)
         rel_pos_embeds = self.embedding(rel_pos)
diff --git a/diffsynth_engine/pipelines/base.py b/diffsynth_engine/pipelines/base.py
@@ -254,26 +254,26 @@ def init_parallel_config(
             "use_fsdp": use_fsdp,
         }
 
-    @staticmethod
-    def validate_offload_mode(offload_mode: str | None):
-        valid_offload_mode = (None, "cpu_offload", "sequential_cpu_offload")
+    def enable_cpu_offload(self, offload_mode: str):
+        valid_offload_mode = ("cpu_offload", "sequential_cpu_offload")
         if offload_mode not in valid_offload_mode:
             raise ValueError(f"offload_mode must be one of {valid_offload_mode}, but got {offload_mode}")
-
-    def enable_cpu_offload(self):
         if self.device == "cpu":
             logger.warning("must set an non cpu device for pipeline before calling enable_cpu_offload")
             return
+        if offload_mode == "cpu_offload":
+            self.enable_model_cpu_offload()
+        elif offload_mode == "sequential_cpu_offload":
+            self.enable_sequential_cpu_offload()
+
+    def enable_model_cpu_offload(self):
         for model_name in self.model_names:
             model = getattr(self, model_name)
             if model is not None:
                 model.to("cpu")
         self.offload_mode = "cpu_offload"
 
     def enable_sequential_cpu_offload(self):
-        if self.device == "cpu":
-            logger.warning("must set an non cpu device for pipeline before calling enable_sequential_cpu_offload")
-            return
         for model_name in self.model_names:
             model = getattr(self, model_name)
             if model is not None:
diff --git a/diffsynth_engine/pipelines/flux_image.py b/diffsynth_engine/pipelines/flux_image.py
@@ -361,8 +361,6 @@ def from_pretrained(
         parallelism: int = 1,
         use_cfg_parallel: bool = False,
     ) -> "FluxImagePipeline":
-        cls.validate_offload_mode(offload_mode)
-
         model_config = (
             model_path_or_config
             if isinstance(model_path_or_config, FluxModelConfig)
@@ -460,10 +458,8 @@ def from_pretrained(
             device=device,
             dtype=dtype,
         )
-        if offload_mode == "cpu_offload":
-            pipe.enable_cpu_offload()
-        elif offload_mode == "sequential_cpu_offload":
-            pipe.enable_sequential_cpu_offload()
+        if offload_mode is not None:
+            pipe.enable_cpu_offload(offload_mode)
         return pipe
 
     def load_loras(self, lora_list: List[Tuple[str, float]], fused: bool = True, save_original_weight: bool = False):
@@ -751,7 +747,7 @@ def predict_multicontrolnet(
                 # if current_step is not in the control range
                 # skip thie controlnet
                 continue
-            if self.offload_mode == "sequential_cpu_offload" or self.offload_mode == "cpu_offload":
+            if self.offload_mode is not None:
                 empty_cache()
                 param.model.to(self.device)
             double_block_output, single_block_output = param.model(
@@ -765,7 +761,7 @@ def predict_multicontrolnet(
                 image_ids,
                 text_ids,
             )
-            if self.offload_mode == "sequential_cpu_offload" or self.offload_mode == "cpu_offload":
+            if self.offload_mode is not None:
                 empty_cache()
                 param.model.to("cpu")
             double_block_output_results = accumulate(double_block_output_results, double_block_output)
diff --git a/diffsynth_engine/pipelines/sd_image.py b/diffsynth_engine/pipelines/sd_image.py
@@ -190,8 +190,6 @@ def from_pretrained(
         offload_mode: str | None = None,
         batch_cfg: bool = True,
     ) -> "SDImagePipeline":
-        cls.validate_offload_mode(offload_mode)
-
         if isinstance(model_path_or_config, str):
             model_config = SDModelConfig(unet_path=model_path_or_config)
         else:
@@ -237,10 +235,8 @@ def from_pretrained(
             device=device,
             dtype=dtype,
         )
-        if offload_mode == "cpu_offload":
-            pipe.enable_cpu_offload()
-        elif offload_mode == "sequential_cpu_offload":
-            pipe.enable_sequential_cpu_offload()
+        if offload_mode is not None:
+            pipe.enable_cpu_offload(offload_mode)
         return pipe
 
     @classmethod
diff --git a/diffsynth_engine/pipelines/sdxl_image.py b/diffsynth_engine/pipelines/sdxl_image.py
@@ -164,8 +164,6 @@ def from_pretrained(
         offload_mode: str | None = None,
         batch_cfg: bool = True,
     ) -> "SDXLImagePipeline":
-        cls.validate_offload_mode(offload_mode)
-
         if isinstance(model_path_or_config, str):
             model_config = SDXLModelConfig(
                 unet_path=model_path_or_config, unet_dtype=dtype, clip_l_dtype=dtype, clip_g_dtype=dtype
@@ -225,10 +223,8 @@ def from_pretrained(
             device=device,
             dtype=dtype,
         )
-        if offload_mode == "cpu_offload":
-            pipe.enable_cpu_offload()
-        elif offload_mode == "sequential_cpu_offload":
-            pipe.enable_sequential_cpu_offload()
+        if offload_mode is not None:
+            pipe.enable_cpu_offload(offload_mode)
         return pipe
 
     @classmethod
diff --git a/diffsynth_engine/pipelines/wan_video.py b/diffsynth_engine/pipelines/wan_video.py
@@ -159,7 +159,7 @@ def __init__(
         self.vae = vae
         self.image_encoder = image_encoder
         self.batch_cfg = batch_cfg
-        self.model_names = ["text_encoder", "dit", "vae"]
+        self.model_names = ["text_encoder", "dit", "vae", "image_encoder"]
 
     def load_loras(self, lora_list: List[Tuple[str, float]], fused: bool = True, save_original_weight: bool = False):
         assert self.config.tp_degree is None, (
@@ -417,8 +417,6 @@ def from_pretrained(
         parallelism: int = 1,
         use_cfg_parallel: bool = False,
     ) -> "WanVideoPipeline":
-        cls.validate_offload_mode(offload_mode)
-
         if isinstance(model_path_or_config, str):
             model_config = WanModelConfig(model_path=model_path_or_config)
         else:
@@ -523,10 +521,8 @@ def from_pretrained(
             dtype=dtype,
         )
         pipe.eval()
-        if offload_mode == "cpu_offload":
-            pipe.enable_cpu_offload()
-        elif offload_mode == "sequential_cpu_offload":
-            pipe.enable_sequential_cpu_offload()
+        if offload_mode is not None:
+            pipe.enable_cpu_offload(offload_mode)
         return pipe
 
     def __del__(self):
diff --git a/diffsynth_engine/utils/offload.py b/diffsynth_engine/utils/offload.py
@@ -1,44 +1,36 @@
+import torch
 import torch.nn as nn
 
-from diffsynth_engine.models.basic.transformer_helper import RMSNorm
-from diffsynth_engine.models.basic.relative_position_emb import RelativePositionEmbedding
-
-
-SUPPORTED_OFFLOAD_MODULES = (
-    nn.Embedding,
-    nn.Linear,
-    nn.LayerNorm,
-    nn.Conv2d,
-    nn.GroupNorm,
-    RMSNorm,
-    RelativePositionEmbedding,
-)
-
 
 def enable_sequential_cpu_offload(module: nn.Module, device: str = "cuda:0"):
-    if isinstance(module, SUPPORTED_OFFLOAD_MODULES):
-        add_cpu_offload_hook(module, device)
+    if len(list(module.children())) == 0:
+        if len(list(module.parameters())) > 0:  # leaf module with parameters
+            add_cpu_offload_hook(module, device)
         return
+    if len(list(module.parameters(recurse=False))) > 0:  # module with direct parameters
+        add_cpu_offload_hook(module, device, recurse=False)
     for submodule in module.children():
         enable_sequential_cpu_offload(submodule, device)
 
 
-def add_cpu_offload_hook(module: nn.Module, device: str = "cuda:0"):
+# TODO: support module buffers (the hooks currently move only parameters to the device)
+def add_cpu_offload_hook(module: nn.Module, device: str = "cuda:0", recurse: bool = True):
     def _forward_pre_hook(module: nn.Module, input):
         offload_params = {}
-        for name, param in module.named_parameters():
+        for name, param in module.named_parameters(recurse=recurse):
             offload_params[name] = param.data
             param.data = param.data.to(device=device)
         setattr(module, "_offload_params", offload_params)
+        return tuple(x.to(device=device) if isinstance(x, torch.Tensor) else x for x in input)
 
     def _forward_hook(module: nn.Module, input, output):
         offload_params = getattr(module, "_offload_params", {})
-        for name, param in module.named_parameters():
+        for name, param in module.named_parameters(recurse=recurse):
             if name in offload_params:
                 param.data = offload_params[name]
 
-    if getattr(module, "_sequential_cpu_offload_enabled", False):
+    if getattr(module, "_cpu_offload_enabled", False):
         return
     module.register_forward_pre_hook(_forward_pre_hook)
     module.register_forward_hook(_forward_hook)
-    setattr(module, "_sequential_cpu_offload_enabled", True)
+    setattr(module, "_cpu_offload_enabled", True)
diff --git a/diffsynth_engine/utils/parallel.py b/diffsynth_engine/utils/parallel.py
@@ -338,7 +338,6 @@ def __init__(
     ):
         super().__init__()
         self.world_size = cfg_degree * sp_ulysses_degree * sp_ring_degree * tp_degree
-        self.device = device
         self.queue_in = mp.Queue()
         self.queue_out = mp.Queue()
         self.ctx = mp.spawn(