Support Lumina-image-2.0 #1927

Open · wants to merge 69 commits into base: sd3

Commits (69)
d154e76
init
sdbds Feb 12, 2025
c0caf33
update
sdbds Feb 15, 2025
7323ee1
update lora_lumina
sdbds Feb 15, 2025
a00b06b
Lumina 2 and Gemma 2 model loading
rockerBOO Feb 15, 2025
60a76eb
Add caching gemma2, add gradient checkpointing, refactor lumina model…
rockerBOO Feb 16, 2025
1601563
Update metadata.resolution for Lumina 2
rockerBOO Feb 16, 2025
6965a01
Merge pull request #12 from rockerBOO/lumina-model-loading
sdbds Feb 16, 2025
733fdc0
update
sdbds Feb 17, 2025
3ce23b7
Merge branch 'lumina' of https://github.com/sdbds/sd-scripts into lumina
sdbds Feb 17, 2025
bb7bae5
Merge pull request #13 from rockerBOO/lumina-cache-checkpointing
sdbds Feb 17, 2025
aa36c48
update for always use gemma2 mask
sdbds Feb 17, 2025
44782dd
Fix validation epoch divergence
rockerBOO Feb 14, 2025
3365cfa
Fix sizes for validation split
rockerBOO Feb 17, 2025
3ed7606
Clear sizes for validation reg images to be consistent
rockerBOO Feb 17, 2025
1aa2f00
Fix validation epoch loss to check epoch average
rockerBOO Feb 16, 2025
98efbc3
Add documentation to model, use SDPA attention, sample images
rockerBOO Feb 18, 2025
bd16bd1
Remove unused attention, fix typo
rockerBOO Feb 18, 2025
6597631
Merge pull request #14 from rockerBOO/samples-attention
sdbds Feb 19, 2025
025cca6
Fix samples, LoRA training. Add system prompt, use_flash_attn
rockerBOO Feb 23, 2025
6d7bec8
Remove non-used code
rockerBOO Feb 23, 2025
42a8015
Fix system prompt in datasets
rockerBOO Feb 23, 2025
ba725a8
Set default discrete_flow_shift to 6.0. Remove default system prompt.
rockerBOO Feb 23, 2025
48e7da2
Add sample batch size for Lumina
rockerBOO Feb 24, 2025
2c94d17
Fix typo
rockerBOO Feb 24, 2025
653621d
Merge pull request #15 from rockerBOO/samples-training
sdbds Feb 24, 2025
fc772af
1. Implement cfg_trunc calculation directly using timesteps, without i…
sdbds Feb 24, 2025
5f9047c
add truncation when > max_length
sdbds Feb 25, 2025
ce37c08
clean code and add finetune code
sdbds Feb 26, 2025
a1a5627
fix shift
sdbds Feb 26, 2025
7b83d50
Merge branch 'sd3' into lumina
rockerBOO Feb 27, 2025
70403f6
fix cache text encoder outputs if not using disk. small cleanup/align…
rockerBOO Feb 27, 2025
542f980
Fix sample norms in batches
rockerBOO Feb 27, 2025
0886d97
Add block swap
rockerBOO Feb 27, 2025
ce2610d
Change system prompt to inject Prompt Start special token
rockerBOO Feb 27, 2025
42fe22f
Enable block swap for Lumina
rockerBOO Feb 27, 2025
9647f1e
Fix validation block swap. Add custom offloading tests
rockerBOO Feb 28, 2025
d6f7e2e
Fix block swap for sample images
rockerBOO Feb 28, 2025
1bba7ac
Add block swap in sample image timestep loop
rockerBOO Feb 28, 2025
a2daa87
Add block swap for uncond (neg) for sample images
rockerBOO Feb 28, 2025
cad182d
fix torch compile/dynamo for Gemma2
rockerBOO Feb 28, 2025
a69884a
Add Sage Attention for Lumina
rockerBOO Mar 2, 2025
3817b65
Merge pull request #16 from rockerBOO/lumina
sdbds Mar 2, 2025
800d068
Merge pull request #17 from rockerBOO/lumina-cache-text-encoder-outputs
sdbds Mar 2, 2025
d6c3e63
Merge pull request #18 from rockerBOO/fix-sample-batch-norms
sdbds Mar 2, 2025
b5d1f1c
Merge pull request #19 from rockerBOO/lumina-block-swap
sdbds Mar 2, 2025
b6e4194
Merge pull request #20 from rockerBOO/lumina-system-prompt-special-token
sdbds Mar 2, 2025
dfe1ab6
Merge pull request #21 from rockerBOO/lumina-torch-dynamo-gemma2
sdbds Mar 2, 2025
09c4710
Merge pull request #22 from rockerBOO/sage_attn
sdbds Mar 3, 2025
5e45df7
update gemma2 train attention layer
sdbds Mar 4, 2025
1f22a94
Update embedder_dims, add more flexible caption extension
rockerBOO Mar 4, 2025
9fe8a47
Undo dropout after up
rockerBOO Mar 4, 2025
e8c15c7
Remove log
rockerBOO Mar 4, 2025
7482784
Merge pull request #23 from rockerBOO/lumina-lora
sdbds Mar 9, 2025
2ba1cc7
Fix max norms not applying to noise
rockerBOO Mar 22, 2025
61f7283
Fix non-cache vae encode
rockerBOO Mar 22, 2025
1481217
Merge pull request #25 from rockerBOO/lumina-fix-non-cache-image-vae-…
sdbds Mar 22, 2025
3000816
Merge pull request #24 from rockerBOO/lumina-fix-max-norms
sdbds Mar 22, 2025
00e12ee
update for lost change
sdbds Apr 6, 2025
1a4f1ff
Merge branch 'lumina' of https://github.com/sdbds/sd-scripts into lumina
sdbds Apr 6, 2025
9f1892c
Merge branch 'sd3' into lumina
sdbds Apr 6, 2025
7f93e21
fix typo
sdbds Apr 6, 2025
899f345
update for init problem
sdbds Apr 23, 2025
4fc9178
fix bugs
sdbds Apr 23, 2025
0145efc
Merge branch 'sd3' into lumina
rockerBOO Jun 9, 2025
d94bed6
Add lumina tests and fix image masks
rockerBOO Jun 10, 2025
77dbabe
Merge pull request #26 from rockerBOO/lumina-test-fix-mask
sdbds Jun 10, 2025
1db7855
Merge branch 'sd3' into update-sd3
rockerBOO Jun 16, 2025
0e929f9
Revert system_prompt for dataset config
rockerBOO Jun 16, 2025
8e4dc1f
Merge pull request #28 from rockerBOO/lumina-train_util
sdbds Jun 17, 2025
6 changes: 6 additions & 0 deletions library/config_util.py
@@ -75,6 +75,7 @@ class BaseSubsetParams:
custom_attributes: Optional[Dict[str, Any]] = None
validation_seed: int = 0
validation_split: float = 0.0
system_prompt: Optional[str] = None
resize_interpolation: Optional[str] = None


@@ -107,6 +108,7 @@ class BaseDatasetParams:
debug_dataset: bool = False
validation_seed: Optional[int] = None
validation_split: float = 0.0
system_prompt: Optional[str] = None
resize_interpolation: Optional[str] = None

@dataclass
@@ -197,6 +199,7 @@ def __validate_and_convert_scalar_or_twodim(klass, value: Union[float, Sequence]
"caption_prefix": str,
"caption_suffix": str,
"custom_attributes": dict,
"system_prompt": str,
"resize_interpolation": str,
}
# DO means DropOut
@@ -243,6 +246,7 @@ def __validate_and_convert_scalar_or_twodim(klass, value: Union[float, Sequence]
"validation_split": float,
"resolution": functools.partial(__validate_and_convert_scalar_or_twodim.__func__, int),
"network_multiplier": float,
"system_prompt": str,
"resize_interpolation": str,
}

@@ -530,6 +534,7 @@ def print_info(_datasets, dataset_type: str):
resolution: {(dataset.width, dataset.height)}
resize_interpolation: {dataset.resize_interpolation}
enable_bucket: {dataset.enable_bucket}
system_prompt: {dataset.system_prompt}
""")

if dataset.enable_bucket:
@@ -564,6 +569,7 @@ def print_info(_datasets, dataset_type: str):
alpha_mask: {subset.alpha_mask}
resize_interpolation: {subset.resize_interpolation}
custom_attributes: {subset.custom_attributes}
system_prompt: {subset.system_prompt}
"""), " ")

if is_dreambooth:
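
Note: `system_prompt` is added at both the dataset level and the subset level (dataclasses, config schemas, and the `print_info` debug output above). Below is a minimal sketch of the fallback one would expect between the two levels; the helper is illustrative only and assumes subset-level values take precedence, which this diff does not spell out.

```python
from typing import Optional


def resolve_system_prompt(
    dataset_system_prompt: Optional[str],
    subset_system_prompt: Optional[str],
) -> Optional[str]:
    # Hypothetical helper, not from the PR: prefer the more specific
    # (subset-level) prompt, fall back to the dataset-level one, and
    # treat None as "no system prompt".
    if subset_system_prompt is not None:
        return subset_system_prompt
    return dataset_system_prompt


# Example: the subset value wins when both are set.
print(resolve_system_prompt("dataset-level prompt", "subset-level prompt"))
```
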
30 changes: 17 additions & 13 deletions library/custom_offloading_utils.py
@@ -1,6 +1,6 @@
from concurrent.futures import ThreadPoolExecutor
import time
from typing import Optional
from typing import Optional, Union, Callable, Tuple
import torch
import torch.nn as nn

@@ -19,7 +19,7 @@ def synchronize_device(device: torch.device):
def swap_weight_devices_cuda(device: torch.device, layer_to_cpu: nn.Module, layer_to_cuda: nn.Module):
assert layer_to_cpu.__class__ == layer_to_cuda.__class__

weight_swap_jobs = []
weight_swap_jobs: list[Tuple[nn.Module, nn.Module, torch.Tensor, torch.Tensor]] = []

# This is not working for all cases (e.g. SD3), so we need to find the corresponding modules
# for module_to_cpu, module_to_cuda in zip(layer_to_cpu.modules(), layer_to_cuda.modules()):
@@ -42,7 +42,7 @@ def swap_weight_devices_cuda(device: torch.device, layer_to_cpu: nn.Module, laye

torch.cuda.current_stream().synchronize() # this prevents the illegal loss value

stream = torch.cuda.Stream()
stream = torch.Stream(device="cuda")
with torch.cuda.stream(stream):
# cuda to cpu
for module_to_cpu, module_to_cuda, cuda_data_view, cpu_data_view in weight_swap_jobs:
@@ -66,23 +66,24 @@ def swap_weight_devices_no_cuda(device: torch.device, layer_to_cpu: nn.Module, l
"""
assert layer_to_cpu.__class__ == layer_to_cuda.__class__

weight_swap_jobs = []
weight_swap_jobs: list[Tuple[nn.Module, nn.Module, torch.Tensor, torch.Tensor]] = []
for module_to_cpu, module_to_cuda in zip(layer_to_cpu.modules(), layer_to_cuda.modules()):
if hasattr(module_to_cpu, "weight") and module_to_cpu.weight is not None:
weight_swap_jobs.append((module_to_cpu, module_to_cuda, module_to_cpu.weight.data, module_to_cuda.weight.data))


# device to cpu
for module_to_cpu, module_to_cuda, cuda_data_view, cpu_data_view in weight_swap_jobs:
module_to_cpu.weight.data = cuda_data_view.data.to("cpu", non_blocking=True)

synchronize_device()
synchronize_device(device)

# cpu to device
for module_to_cpu, module_to_cuda, cuda_data_view, cpu_data_view in weight_swap_jobs:
cuda_data_view.copy_(module_to_cuda.weight.data, non_blocking=True)
module_to_cuda.weight.data = cuda_data_view

synchronize_device()
synchronize_device(device)


def weighs_to_device(layer: nn.Module, device: torch.device):
@@ -148,13 +149,16 @@ def _wait_blocks_move(self, block_idx):
print(f"Waited for block {block_idx}: {time.perf_counter()-start_time:.2f}s")


# Gradient tensors
_grad_t = Union[tuple[torch.Tensor, ...], torch.Tensor]

class ModelOffloader(Offloader):
"""
supports forward offloading
"""

def __init__(self, blocks: list[nn.Module], num_blocks: int, blocks_to_swap: int, device: torch.device, debug: bool = False):
super().__init__(num_blocks, blocks_to_swap, device, debug)
def __init__(self, blocks: Union[list[nn.Module], nn.ModuleList], blocks_to_swap: int, device: torch.device, debug: bool = False):
super().__init__(len(blocks), blocks_to_swap, device, debug)

# register backward hooks
self.remove_handles = []
@@ -168,7 +172,7 @@ def __del__(self):
for handle in self.remove_handles:
handle.remove()

def create_backward_hook(self, blocks: list[nn.Module], block_index: int) -> Optional[callable]:
def create_backward_hook(self, blocks: Union[list[nn.Module], nn.ModuleList], block_index: int) -> Optional[Callable[[nn.Module, _grad_t, _grad_t], Union[None, _grad_t]]]:
# -1 for 0-based index
num_blocks_propagated = self.num_blocks - block_index - 1
swapping = num_blocks_propagated > 0 and num_blocks_propagated <= self.blocks_to_swap
@@ -182,7 +186,7 @@ def create_backward_hook(self, blocks: list[nn.Module], block_index: int) -> Opt
block_idx_to_cuda = self.blocks_to_swap - num_blocks_propagated
block_idx_to_wait = block_index - 1

def backward_hook(module, grad_input, grad_output):
def backward_hook(module: nn.Module, grad_input: _grad_t, grad_output: _grad_t):
if self.debug:
print(f"Backward hook for block {block_index}")

@@ -194,7 +198,7 @@ def backward_hook(module, grad_input, grad_output):

return backward_hook

def prepare_block_devices_before_forward(self, blocks: list[nn.Module]):
def prepare_block_devices_before_forward(self, blocks: Union[list[nn.Module], nn.ModuleList]):
if self.blocks_to_swap is None or self.blocks_to_swap == 0:
return

@@ -207,7 +211,7 @@ def prepare_block_devices_before_forward(self, blocks: list[nn.Module]):

for b in blocks[self.num_blocks - self.blocks_to_swap :]:
b.to(self.device) # move block to device first
weighs_to_device(b, "cpu") # make sure weights are on cpu
weighs_to_device(b, torch.device("cpu")) # make sure weights are on cpu

synchronize_device(self.device)
clean_memory_on_device(self.device)
@@ -217,7 +221,7 @@ def wait_for_block(self, block_idx: int):
return
self._wait_blocks_move(block_idx)

def submit_move_blocks(self, blocks: list[nn.Module], block_idx: int):
def submit_move_blocks(self, blocks: Union[list[nn.Module], nn.ModuleList], block_idx: int):
if self.blocks_to_swap is None or self.blocks_to_swap == 0:
return
if block_idx >= self.blocks_to_swap:
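
With the constructor change above, callers hand `ModelOffloader` only the block list; the block count is now derived via `len(blocks)`. The sketch below shows how a forward pass might drive the offloader using the method signatures from this diff. The `TinyBlocks` module, the import path, and the exact wait/run/submit loop order are assumptions rather than code from the repository.

```python
import torch
import torch.nn as nn

# Import path assumes the repository root is on sys.path.
from library.custom_offloading_utils import ModelOffloader


class TinyBlocks(nn.Module):
    def __init__(self, num_blocks: int = 8, dim: int = 16):
        super().__init__()
        self.blocks = nn.ModuleList(nn.Linear(dim, dim) for _ in range(num_blocks))

    def forward(self, x: torch.Tensor, offloader: ModelOffloader) -> torch.Tensor:
        for idx, block in enumerate(self.blocks):
            offloader.wait_for_block(idx)                   # wait until this block's weights are on the device
            x = block(x)
            offloader.submit_move_blocks(self.blocks, idx)  # queue swapping a finished block back to CPU
        return x


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TinyBlocks()
# New signature: blocks, blocks_to_swap, device (num_blocks is inferred from len(blocks)).
offloader = ModelOffloader(model.blocks, blocks_to_swap=4, device=device)
offloader.prepare_block_devices_before_forward(model.blocks)
out = model(torch.randn(2, 16, device=device), offloader)
```

The backward hooks registered in `__init__` appear to cover the reverse direction during backpropagation, so a forward loop like this would only need the three calls shown; treat the snippet as a call-pattern sketch, not a drop-in.
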
12 changes: 6 additions & 6 deletions library/flux_models.py
@@ -977,10 +977,10 @@ def enable_block_swap(self, num_blocks: int, device: torch.device):
)

self.offloader_double = custom_offloading_utils.ModelOffloader(
self.double_blocks, self.num_double_blocks, double_blocks_to_swap, device # , debug=True
self.double_blocks, double_blocks_to_swap, device # , debug=True
)
self.offloader_single = custom_offloading_utils.ModelOffloader(
self.single_blocks, self.num_single_blocks, single_blocks_to_swap, device # , debug=True
self.single_blocks, single_blocks_to_swap, device # , debug=True
)
print(
f"FLUX: Block swap enabled. Swapping {num_blocks} blocks, double blocks: {double_blocks_to_swap}, single blocks: {single_blocks_to_swap}."
@@ -1219,10 +1219,10 @@ def enable_block_swap(self, num_blocks: int, device: torch.device):
)

self.offloader_double = custom_offloading_utils.ModelOffloader(
self.double_blocks, self.num_double_blocks, double_blocks_to_swap, device # , debug=True
self.double_blocks, double_blocks_to_swap, device # , debug=True
)
self.offloader_single = custom_offloading_utils.ModelOffloader(
self.single_blocks, self.num_single_blocks, single_blocks_to_swap, device # , debug=True
self.single_blocks, single_blocks_to_swap, device # , debug=True
)
print(
f"FLUX: Block swap enabled. Swapping {num_blocks} blocks, double blocks: {double_blocks_to_swap}, single blocks: {single_blocks_to_swap}."
@@ -1233,8 +1233,8 @@ def move_to_device_except_swap_blocks(self, device: torch.device):
if self.blocks_to_swap:
save_double_blocks = self.double_blocks
save_single_blocks = self.single_blocks
self.double_blocks = None
self.single_blocks = None
self.double_blocks = nn.ModuleList()
self.single_blocks = nn.ModuleList()

self.to(device)

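
The second change in this file replaces `None` with empty `nn.ModuleList()` placeholders while `self.to(device)` runs, so the attributes remain valid modules and the swappable blocks are left where the offloader put them. Below is a standalone sketch of that save/replace/restore pattern; `BlockSwapModel` and `move_except_blocks` are illustrative names, and the restore step is implied by the saved references rather than shown in the visible hunk.

```python
import torch
import torch.nn as nn


class BlockSwapModel(nn.Module):
    def __init__(self, dim: int = 16, num_blocks: int = 4):
        super().__init__()
        self.in_proj = nn.Linear(dim, dim)
        self.blocks = nn.ModuleList(nn.Linear(dim, dim) for _ in range(num_blocks))

    def move_except_blocks(self, device: torch.device) -> None:
        saved_blocks = self.blocks
        self.blocks = nn.ModuleList()  # empty placeholder, as in the diff above, rather than None
        self.to(device)                # moves in_proj (and any other submodules) but not the saved blocks
        self.blocks = saved_blocks     # restore the references; the blocks stay on their current devices


model = BlockSwapModel()
model.move_except_blocks(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
```
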