Fix SDXL LoRA loading (#108)

Glaceon-Hyy · web-flow · commit 5c149ece6ffa · 2025-07-03T16:30:23.000+08:00
* fix sdxl lora load

* fix device
diff --git a/diffsynth_engine/models/sd/sd_controlnet.py b/diffsynth_engine/models/sd/sd_controlnet.py
@@ -10,8 +10,6 @@
     AttentionBlock,
     PushBlock,
     DownSampler,
-    PopBlock,
-    UpSampler,
 )
 
 class ControlNetConditioningLayer(nn.Module):
@@ -565,7 +563,6 @@ def forward(
         time_emb = self.time_embedding(timestep, dtype=sample.dtype)
 
         # 2. pre-process
-        height, width = sample.shape[2], sample.shape[3]
         hidden_states = self.conv_in(sample) + self.controlnet_conv_in(conditioning)
         text_emb = encoder_hidden_states
         res_stack = [hidden_states]
diff --git a/diffsynth_engine/models/sdxl/sdxl_controlnet.py b/diffsynth_engine/models/sdxl/sdxl_controlnet.py
@@ -1,13 +1,10 @@
 import torch
-import torch.nn as nn
 from typing import Optional, Dict
 from diffsynth_engine.models.basic.unet_helper import (
     ResnetBlock,
     AttentionBlock,
     PushBlock,
     DownSampler,
-    PopBlock,
-    UpSampler,
 )
 from diffsynth_engine.models.sd.sd_controlnet import ControlNetConditioningLayer
 from diffsynth_engine.models.base import PreTrainedModel, StateDictConverter
@@ -283,7 +280,6 @@ def forward(
         time_emb = t_emb + add_embeds + control_embeds
 
         # 2. pre-process
-        height, width = sample.shape[2], sample.shape[3]
         hidden_states = self.conv_in(sample)
         hidden_states = self.fuse_condition_to_input(hidden_states, task_id, conditioning)
         text_emb = encoder_hidden_states
diff --git a/diffsynth_engine/pipelines/controlnet_helper.py b/diffsynth_engine/pipelines/controlnet_helper.py
@@ -1,6 +1,6 @@
 import torch
 import torch.nn as nn
-from typing import Dict, List, Tuple, Union, Optional
+from typing import List, Union, Optional
 from PIL import Image
 from dataclasses import dataclass
 
diff --git a/diffsynth_engine/pipelines/flux_image.py b/diffsynth_engine/pipelines/flux_image.py
@@ -2,12 +2,11 @@
 import os
 import json
 import torch
-import torch.nn as nn
 import torch.distributed as dist
 import math
 from einops import rearrange
 from enum import Enum
-from typing import Callable, Dict, List, Tuple, Optional, Union
+from typing import Callable, Dict, List, Tuple, Optional
 from tqdm import tqdm
 from PIL import Image
 from dataclasses import dataclass
diff --git a/diffsynth_engine/pipelines/sd_image.py b/diffsynth_engine/pipelines/sd_image.py
@@ -18,6 +18,7 @@
 from diffsynth_engine.algorithm.sampler import EulerSampler
 from diffsynth_engine.utils.prompt import tokenize_long_prompt
 from diffsynth_engine.utils.constants import SDXL_TOKENIZER_CONF_PATH
+from diffsynth_engine.utils.platform import empty_cache
 from diffsynth_engine.utils import logging
 
 logger = logging.get_logger(__name__)
diff --git a/diffsynth_engine/pipelines/sdxl_image.py b/diffsynth_engine/pipelines/sdxl_image.py
@@ -26,6 +26,7 @@
 from diffsynth_engine.algorithm.sampler import EulerSampler
 from diffsynth_engine.utils.prompt import tokenize_long_prompt
 from diffsynth_engine.utils.constants import SDXL_TOKENIZER_CONF_PATH, SDXL_TOKENIZER_2_CONF_PATH
+from diffsynth_engine.utils.platform import empty_cache
 from diffsynth_engine.utils import logging
 
 logger = logging.get_logger(__name__)
@@ -89,6 +90,8 @@ def _from_kohya(self, lora_state_dict: Dict[str, torch.Tensor]) -> Dict[str, Dic
                 unet_dict[key] = lora_args
             else:
                 raise ValueError(f"Unsupported key: {key}")
+        # clip skip: discard text_encoder LoRA entries whose keys start with 'encoders.11'
+        te1_dict = {k: v for k, v in te1_dict.items() if not k.startswith('encoders.11')}
         return {"unet": unet_dict, "text_encoder": te1_dict, "text_encoder_2": te2_dict}
 
     def convert(self, lora_state_dict: Dict[str, torch.Tensor]) -> Dict[str, Dict[str, torch.Tensor]]: