support SD3 #1374

Draft: wants to merge 533 commits into base: dev

Commits (533)
b8d3fec
Merge branch 'sd3' into faster-block-swap
kohya-ss Nov 7, 2024
186aa5b
fix illegal block is swapped #1764
kohya-ss Nov 7, 2024
b3248a8
fix: sort order when getting image size from cache file
feffy380 Nov 7, 2024
2a2042a
Merge pull request #1770 from feffy380/fix-size-from-cache
kohya-ss Nov 9, 2024
8fac3c3
update README
kohya-ss Nov 9, 2024
26bd454
init
sdbds Nov 11, 2024
02bd76e
Refactor block swapping to utilize custom offloading utilities
kohya-ss Nov 11, 2024
7feaae5
Merge branch 'sd3' into faster-block-swap
kohya-ss Nov 11, 2024
92482c7
Merge pull request #1774 from sdbds/avif_get_imagesize
kohya-ss Nov 11, 2024
3fe94b0
update comment
kohya-ss Nov 11, 2024
cde90b8
feat: implement block swapping for FLUX.1 LoRA (WIP)
kohya-ss Nov 11, 2024
17cf249
Merge branch 'sd3' into faster-block-swap
kohya-ss Nov 11, 2024
2cb7a6d
feat: add block swap for FLUX.1/SD3 LoRA training
kohya-ss Nov 12, 2024
2bb0f54
update grad hook creation to fix TE lr in sd3 fine tuning
kohya-ss Nov 14, 2024
5c5b544
refactor: remove unused prepare_split_model method from FluxNetworkTr…
kohya-ss Nov 14, 2024
fd2d879
docs: update README
kohya-ss Nov 14, 2024
0047bb1
Merge pull request #1779 from kohya-ss/faster-block-swap
kohya-ss Nov 14, 2024
ccfaa00
add flux controlnet base module
minux302 Nov 15, 2024
42f6edf
fix for adding controlnet
minux302 Nov 15, 2024
e358b11
fix dataloader
minux302 Nov 16, 2024
2a188f0
Fix to work DOP with block swap
kohya-ss Nov 17, 2024
b2660bb
train run
minux302 Nov 17, 2024
35778f0
fix sample_images type
minux302 Nov 17, 2024
4dd4cd6
work cn load and validation
minux302 Nov 18, 2024
31ca899
fix depth value
minux302 Nov 18, 2024
2a61fc0
docs: fix typo from block_to_swap to blocks_to_swap in README
kohya-ss Nov 20, 2024
0b5229a
save cn
minux302 Nov 21, 2024
420a180
Implement pseudo Huber loss for Flux and SD3
recris Nov 27, 2024
740ec1d
Fix issues found in review
recris Nov 28, 2024
9dff44d
fix device
minux302 Nov 29, 2024
575f583
add README
minux302 Nov 29, 2024
be5860f
add schnell option to load_cn
minux302 Nov 29, 2024
f40632b
rm redundant arg
minux302 Nov 29, 2024
928b939
Allow unknown schedule-free optimizers to continue to module loader
rockerBOO Nov 20, 2024
87f5224
Support d*lr for ProdigyPlus optimizer
rockerBOO Nov 20, 2024
6593cfb
Fix d * lr step log
rockerBOO Nov 21, 2024
c7cadbc
Add pytest testing
rockerBOO Nov 29, 2024
2dd063a
add torch torchvision accelerate versions
rockerBOO Nov 29, 2024
e59e276
Add dadaptation
rockerBOO Nov 29, 2024
dd3b846
Install pytorch first to pin version
rockerBOO Nov 29, 2024
89825d6
Run typos workflows once where appropriate
rockerBOO Nov 29, 2024
4f7f248
Bump typos action
rockerBOO Nov 29, 2024
9c885e5
fix: improve pos_embed handling for oversized images and update resol…
kohya-ss Nov 30, 2024
7b61e9e
Fix issues found in review (pt 2)
recris Nov 30, 2024
a5a27fe
Merge pull request #1808 from recris/huber-loss-flux
kohya-ss Dec 1, 2024
14f642f
fix: huber_schedule exponential not working on sd3_train.py
kohya-ss Dec 1, 2024
0fe6320
fix flux_train.py is not working
kohya-ss Dec 1, 2024
cc11989
fix: refactor huber-loss calculation in multiple training scripts
kohya-ss Dec 1, 2024
1476040
fix: update help text for huber loss parameters in train_util.py
kohya-ss Dec 1, 2024
bdf9a8c
Merge pull request #1815 from kohya-ss/flux-huber-loss
kohya-ss Dec 1, 2024
34e7f50
docs: update README for huber loss
kohya-ss Dec 1, 2024
14c9ba9
Merge pull request #1811 from rockerBOO/schedule-free-prodigy
kohya-ss Dec 1, 2024
1dc873d
update README and clean up code for schedulefree optimizer
kohya-ss Dec 1, 2024
e3fd6c5
Merge pull request #1812 from rockerBOO/tests
kohya-ss Dec 2, 2024
09a3740
Merge pull request #1813 from minux302/flux-controlnet
kohya-ss Dec 2, 2024
e369b9a
docs: update README with FLUX.1 ControlNet training details and impro…
kohya-ss Dec 2, 2024
5ab00f9
Update workflow tests with cleanup and documentation
rockerBOO Dec 2, 2024
63738ec
Add tests documentation
rockerBOO Dec 2, 2024
2610e96
Pytest
rockerBOO Dec 2, 2024
3e5d89c
Add more resources
rockerBOO Dec 2, 2024
8b36d90
feat: support block_to_swap for FLUX.1 ControlNet training
kohya-ss Dec 2, 2024
6bee18d
fix: resolve model corruption issue with pos_embed when using --enabl…
kohya-ss Dec 7, 2024
2be3366
Merge pull request #1817 from rockerBOO/workflow-tests-fixes
kohya-ss Dec 7, 2024
abff4b0
Unify controlnet parameters name and change scripts name. (#1821)
sdbds Dec 7, 2024
e425996
feat: unify ControlNet model name option and deprecate old training s…
kohya-ss Dec 7, 2024
3cb8cb2
Prevent git credentials from leaking into other actions
rockerBOO Dec 9, 2024
8e378cf
add RAdamScheduleFree support
nhamanasu Dec 11, 2024
d3305f9
Merge pull request #1828 from rockerBOO/workflow-security-audit
kohya-ss Dec 15, 2024
f2d38e6
Merge pull request #1830 from nhamanasu/sd3
kohya-ss Dec 15, 2024
e896539
update requirements.txt and README to include RAdamScheduleFree optim…
kohya-ss Dec 15, 2024
05bb918
Add Validation loss for LoRA training
hinablue Dec 27, 2024
62164e5
Change val loss calculate method
hinablue Dec 27, 2024
64bd531
Split val latents/batch and pick up val latents shape size which equa…
hinablue Dec 28, 2024
cb89e02
Change val latent loss compare
hinablue Dec 28, 2024
8743532
val
gesen2egee Mar 9, 2024
449c1c5
Adding modified train_util and config_util
rockerBOO Jan 2, 2025
7f6e124
Merge branch 'gesen2egee/val' into validation-loss-upstream
rockerBOO Jan 3, 2025
d23c732
Merge remote-tracking branch 'hina/feature/val-loss' into validation-…
rockerBOO Jan 3, 2025
7470173
Remove defunct code for train_controlnet.py
rockerBOO Jan 3, 2025
534059d
Typos and lingering is_train
rockerBOO Jan 3, 2025
c8c3569
Cleanup order, types, print to logger
rockerBOO Jan 3, 2025
fbfc275
Update text for train/reg with repeats
rockerBOO Jan 3, 2025
58bfa36
Add seed help clarifying info
rockerBOO Jan 3, 2025
6604b36
Remove duplicate assignment
rockerBOO Jan 3, 2025
0522070
Fix training, validation split, revert to using upstream implementation
rockerBOO Jan 3, 2025
695f389
Move get_huber_threshold_if_needed
rockerBOO Jan 3, 2025
1f9ba40
Add step break for validation epoch. Remove unused variable
rockerBOO Jan 3, 2025
1c0ae30
Add missing functions for training batch
rockerBOO Jan 3, 2025
a9c5aa1
add CFG to FLUX.1 sample image
kohya-ss Jan 5, 2025
bbf6bbd
Use self.get_noise_pred_and_target and drop fixed timesteps
rockerBOO Jan 6, 2025
f4840ef
Revert train_db.py
rockerBOO Jan 6, 2025
1c63e7c
Cleanup unused code and formatting
rockerBOO Jan 6, 2025
c64d1a2
Add validate_every_n_epochs, change name validate_every_n_steps
rockerBOO Jan 6, 2025
f885029
Fix validate epoch, cleanup imports
rockerBOO Jan 6, 2025
fcb2ff0
Clean up some validation help documentation
rockerBOO Jan 6, 2025
742bee9
Set validation steps in multiple lines for readability
rockerBOO Jan 6, 2025
1231f51
Remove unused train_util code, fix accelerate.log for wandb, add init…
rockerBOO Jan 8, 2025
556f3f1
Fix documentation, remove unused function, fix bucket reso for sd1.5,…
rockerBOO Jan 8, 2025
9fde0d7
Handle tuple return from generate_dataset_group_by_blueprint
rockerBOO Jan 8, 2025
1e61392
Revert bucket_reso_steps to correct 64
rockerBOO Jan 8, 2025
d6f158d
Fix incorrect destructuring for load_arbitrary_dataset
rockerBOO Jan 8, 2025
264167f
Apply is_training_dataset only to DreamBoothDataset. Add validation_s…
rockerBOO Jan 9, 2025
4c61adc
Add divergence to logs
rockerBOO Jan 12, 2025
2bbb40c
Fix regularization images with validation
rockerBOO Jan 12, 2025
0456858
Fix validate_every_n_steps always running first step
rockerBOO Jan 12, 2025
ee9265c
Fix validate_every_n_steps for gradient accumulation
rockerBOO Jan 12, 2025
25929dd
Remove Validating... print to fix output layout
rockerBOO Jan 12, 2025
b489082
Disable repeats for validation datasets
rockerBOO Jan 12, 2025
c04e5df
Fix loss recorder on 0. Fix validation for cached runs. Assert on val…
rockerBOO Jan 23, 2025
6acdbed
Merge branch 'dev' into sd3
kohya-ss Jan 26, 2025
23ce75c
Merge branch 'dev' into sd3
kohya-ss Jan 26, 2025
b833d47
Merge pull request #1864 from rockerBOO/validation-loss-upstream
kohya-ss Jan 26, 2025
58b82a5
Fix to work with validation dataset
kohya-ss Jan 26, 2025
e852961
README.md: Update recent updates section to include validation loss s…
kohya-ss Jan 26, 2025
f1ac81e
Merge pull request #1899 from kohya-ss/val-loss
kohya-ss Jan 26, 2025
59b3b94
README.md: Update limitation for validation loss support to include s…
kohya-ss Jan 26, 2025
532f5c5
formatting
kohya-ss Jan 27, 2025
86a2f3f
Fix gradient handling when Text Encoders are trained
kohya-ss Jan 27, 2025
b6a3093
call optimizer eval/train fn before/after validation
kohya-ss Jan 27, 2025
29f31d0
add network.train()/eval() for validation
kohya-ss Jan 27, 2025
0750859
validation: Implement timestep-based validation processing
kohya-ss Jan 27, 2025
0778dd9
fix Text Encoder only LoRA training
kohya-ss Jan 27, 2025
42c0a9e
Merge branch 'sd3' into val-loss-improvement
kohya-ss Jan 27, 2025
45ec02b
use same noise for every validation
kohya-ss Jan 27, 2025
de830b8
Move progress bar to account for sampling image first
rockerBOO Jan 29, 2025
c5b803c
rng state management: Implement functions to get and set RNG states f…
kohya-ss Feb 4, 2025
a24db1d
fix: validation timestep generation fails on SD/SDXL training
kohya-ss Feb 4, 2025
0911683
set python random state
kohya-ss Feb 9, 2025
344845b
fix: validation with block swap
kohya-ss Feb 9, 2025
1772038
fix: unpause training progress bar after validation
kohya-ss Feb 11, 2025
cd80752
fix: remove unused parameter 'accelerator' from encode_images_to_late…
kohya-ss Feb 11, 2025
76b7619
fix: simplify validation step condition in NetworkTrainer
kohya-ss Feb 11, 2025
ab88b43
Fix validation epoch divergence
rockerBOO Feb 14, 2025
ee295c7
Merge pull request #1935 from rockerBOO/validation-epoch-fix
kohya-ss Feb 15, 2025
63337d9
Merge branch 'sd3' into val-loss-improvement
kohya-ss Feb 15, 2025
4671e23
Fix validation epoch loss to check epoch average
rockerBOO Feb 16, 2025
3c7496a
Fix sizes for validation split
rockerBOO Feb 17, 2025
f3a0109
Clear sizes for validation reg images to be consistent
rockerBOO Feb 17, 2025
6051fa8
Merge pull request #1940 from rockerBOO/split-size-fix
kohya-ss Feb 17, 2025
7c22e12
Merge pull request #1938 from rockerBOO/validation-epoch-loss-recorder
kohya-ss Feb 17, 2025
9436b41
Fix validation split and add test
rockerBOO Feb 17, 2025
894037f
Merge pull request #1943 from rockerBOO/validation-split-test
kohya-ss Feb 18, 2025
dc7d5fb
Merge branch 'sd3' into val-loss-improvement
kohya-ss Feb 18, 2025
4a36996
modify log step calculation
kohya-ss Feb 18, 2025
58e9e14
Add resize interpolation configuration
rockerBOO Feb 14, 2025
d0128d1
Add resize interpolation CLI option
rockerBOO Feb 14, 2025
7729c4c
Add metadata
rockerBOO Feb 14, 2025
545425c
Typo
rockerBOO Feb 14, 2025
ca1c129
Fix metadata
rockerBOO Feb 14, 2025
7f27471
Use resize_image where resizing is required
rockerBOO Feb 19, 2025
efb2a12
fix wandb val logging
kohya-ss Feb 21, 2025
905f081
Merge branch 'dev' into sd3
kohya-ss Feb 24, 2025
67fde01
Merge branch 'dev' into sd3
kohya-ss Feb 24, 2025
6e90c0f
Merge pull request #1909 from rockerBOO/progress_bar
kohya-ss Feb 24, 2025
f4a0047
feat: support metadata loading in MemoryEfficientSafeOpen
kohya-ss Feb 26, 2025
5228db1
feat: add script to merge multiple safetensors files into a single fi…
kohya-ss Feb 26, 2025
ae409e8
fix: FLUX/SD3 network training not working without caching latents cl…
kohya-ss Feb 26, 2025
1fcac98
Merge branch 'sd3' into val-loss-improvement
kohya-ss Feb 26, 2025
4965189
Merge pull request #1903 from kohya-ss/val-loss-improvement
kohya-ss Feb 26, 2025
ec350c8
Merge branch 'dev' into sd3
kohya-ss Feb 26, 2025
3d79239
docs: update README to include recent improvements in validation loss…
kohya-ss Feb 26, 2025
734333d
feat: enhance merging logic for safetensors models to handle key pref…
kohya-ss Feb 28, 2025
272f4c3
Merge branch 'sd3' into sd3_safetensors_merge
kohya-ss Feb 28, 2025
ba52511
fix: save tensors as-is dtype, add save_precision option
kohya-ss Mar 1, 2025
aa2bde7
docs: add utility script for merging SD3 weights into a single .safet…
kohya-ss Mar 5, 2025
75933d7
Merge pull request #1960 from kohya-ss/sd3_safetensors_merge
kohya-ss Mar 5, 2025
ea53290
Add LoRA-GGPO for Flux
rockerBOO Mar 6, 2025
e5b5c7e
Update requirements.txt
gesen2egee Mar 15, 2025
3647d06
Cache weight norms estimate on initialization. Move to update norms e…
rockerBOO Mar 18, 2025
0b25a05
Add IP noise gamma for Flux
rockerBOO Mar 18, 2025
c8be141
Apply IP gamma to noise fix
rockerBOO Mar 18, 2025
b425466
Fix IP noise gamma to use random values
rockerBOO Mar 18, 2025
a4f3a9f
Use ones_like
rockerBOO Mar 18, 2025
6f4d365
zeros_like because we are adding
rockerBOO Mar 18, 2025
b81bcd0
Move IP noise gamma to noise creation to remove complexity and align …
rockerBOO Mar 19, 2025
5b210ad
update prodigyopt and prodigy-plus-schedule-free
gesen2egee Mar 19, 2025
7197266
Perturbed noise should be separate of input noise
rockerBOO Mar 19, 2025
d93ad90
Add perturbation on noisy_model_input if needed
rockerBOO Mar 19, 2025
8e6817b
Remove double noise
rockerBOO Mar 19, 2025
1eddac2
Separate random to a variable, and make sure on device
rockerBOO Mar 19, 2025
5d5a7d2
Fix IP noise calculation
rockerBOO Mar 19, 2025
f974c6b
change order to match upstream
rockerBOO Mar 19, 2025
936d333
Merge pull request #1985 from gesen2egee/pytorch-optimizer
kohya-ss Mar 20, 2025
d151833
docs: update README with recent changes and specify version for pytor…
kohya-ss Mar 20, 2025
16cef81
Refactor sigmas and timesteps
rockerBOO Mar 20, 2025
e8b3254
Add flux_train_utils tests for get_noisy_model_input_and_timesteps
rockerBOO Mar 20, 2025
8aa1265
Scale sigmoid to default 1.0
rockerBOO Mar 20, 2025
d40f5b1
Revert "Scale sigmoid to default 1.0"
rockerBOO Mar 20, 2025
89f0d27
Set sigmoid_scale to default 1.0
rockerBOO Mar 20, 2025
6364379
Merge branch 'dev' into sd3
kohya-ss Mar 21, 2025
8ebe858
Merge branch 'dev' into sd3
kohya-ss Mar 24, 2025
182544d
Remove perturbation seed
rockerBOO Mar 26, 2025
0181b7a
Remove progress bar avg norms
rockerBOO Mar 27, 2025
93a4efa
Merge branch 'sd3' into resize-interpolation
kohya-ss Mar 30, 2025
9e9a13a
Merge pull request #1936 from rockerBOO/resize-interpolation
kohya-ss Mar 30, 2025
1f432e2
use PIL for lanczos and box
kohya-ss Mar 30, 2025
96a133c
README.md: update recent updates section to include new interpolation…
kohya-ss Mar 30, 2025
3149b27
Merge pull request #2018 from kohya-ss/resize-interpolation-small-fix
kohya-ss Mar 30, 2025
59d98e4
Merge pull request #1974 from rockerBOO/lora-ggpo
kohya-ss Mar 30, 2025
d0b5c0e
chore: formatting, add TODO comment
kohya-ss Mar 30, 2025
aaa26bb
docs: update README to include LoRA-GGPO details for FLUX.1 training
kohya-ss Mar 30, 2025
b3c56b2
Merge branch 'dev' into sd3
kohya-ss Mar 31, 2025
ede3470
Ensure all size parameters are integers to prevent type errors
LexSong Apr 1, 2025
b822b7e
Fix the interpolation logic error in resize_image()
LexSong Apr 1, 2025
f1423a7
fix: add resize_interpolation parameter to FineTuningDataset constructor
kohya-ss Apr 3, 2025
92845e8
Merge pull request #2026 from kohya-ss/fix-finetune-dataset-resize-in…
kohya-ss Apr 3, 2025
fd36fd1
Fix resize PR link
rockerBOO Apr 3, 2025
606e687
Merge pull request #2022 from LexSong/fix-resize-issue
kohya-ss Apr 5, 2025
ee0f754
Merge pull request #2028 from rockerBOO/patch-5
kohya-ss Apr 5, 2025
c56dc90
Merge pull request #1992 from rockerBOO/flux-ip-noise-gamma
kohya-ss Apr 6, 2025
4589262
README.md: Update recent updates section to include IP noise gamma fe…
kohya-ss Apr 6, 2025
5a18a03
Merge branch 'dev' into sd3
kohya-ss Apr 7, 2025
06df037
Merge branch 'sd3' into flux-sample-cfg
kohya-ss Apr 16, 2025
629073c
Add guidance scale for prompt param and flux sampling
kohya-ss Apr 16, 2025
7c61c0d
Add autocast wrapper for forward functions in deepspeed_utils.py to t…
sharlynxy Apr 22, 2025
d33d5ec
#
sharlynxy Apr 22, 2025
7f984f4
#
sharlynxy Apr 22, 2025
c8af252
refactor
robertwenquan Apr 22, 2025
f501209
Merge branch 'dev/xy/align_dtype_using_mixed_precision' of github.com…
robertwenquan Apr 22, 2025
0d9da0e
Merge pull request #1 from saibit-tech/dev/xy/align_dtype_using_mixed…
sharlynxy Apr 22, 2025
b11c053
Merge branch 'dev' into sd3
kohya-ss Apr 22, 2025
adb775c
Update: requirement diffusers[torch]==0.25.0
sharlynxy Apr 23, 2025
abf2c44
Dynamically set device in deepspeed wrapper (#2)
sharlynxy Apr 23, 2025
46ad3be
update deepspeed wrapper
sharlynxy Apr 24, 2025
5c50cdb
Merge branch 'sd3' into flux-sample-cfg
kohya-ss Apr 27, 2025
8387e0b
docs: update README to include CFG scale support in FLUX.1 training
kohya-ss Apr 27, 2025
309c44b
Merge pull request #2064 from kohya-ss/flux-sample-cfg
kohya-ss Apr 27, 2025
0e8ac43
Merge branch 'dev' into sd3
kohya-ss Apr 27, 2025
13296ae
Merge branch 'sd3' of https://github.com/kohya-ss/sd-scripts into sd3
kohya-ss Apr 27, 2025
fd3a445
fix: revert default emb guidance scale and CFG scale for FLUX.1 sampling
kohya-ss Apr 27, 2025
29523c9
docs: add note for user feedback on CFG scale in FLUX.1 training
kohya-ss Apr 27, 2025
80320d2
Merge pull request #2066 from kohya-ss/quick-fix-flux-sampling-scales
kohya-ss Apr 27, 2025
64430eb
Merge branch 'dev' into sd3
kohya-ss Apr 29, 2025
1684aba
remove deepspeed from requirements.txt
sharlynxy Apr 30, 2025
a4fae93
Add pythonpath to pytest.ini
rockerBOO May 1, 2025
f62c68d
Make grad_norm and combined_grad_norm None if not recording
rockerBOO May 1, 2025
b4a89c3
Fix None
rockerBOO May 1, 2025
7c075a9
Merge pull request #2060 from saibit-tech/sd3
kohya-ss May 1, 2025
865c8d5
README.md: Update recent updates and add DeepSpeed installation instr…
kohya-ss May 1, 2025
a27ace7
doc: add DeepSpeed installation in header section
kohya-ss May 1, 2025
e858132
Merge pull request #2074 from kohya-ss/deepspeed-readme
kohya-ss May 1, 2025
e2ed265
Merge pull request #2072 from rockerBOO/pytest-pythonpath
kohya-ss May 1, 2025
5b38d07
Merge pull request #2073 from rockerBOO/fix-mean-grad-norms
kohya-ss May 11, 2025
2bfda12
Update workflows to read-all instead of write-all
rockerBOO May 20, 2025
5753b8f
Merge pull request #2088 from rockerBOO/checkov-update
kohya-ss May 20, 2025
e4d6923
Add tests for syntax checking training scripts
rockerBOO Jun 3, 2025
61eda76
Merge pull request #2108 from rockerBOO/syntax-test
kohya-ss Jun 4, 2025
bb47f1e
Fix unwrap_model handling for None text_encoders in sample_images fun…
kohya-ss Jun 8, 2025
fc40a27
Merge branch 'dev' into sd3
kohya-ss Jun 15, 2025
3e6935a
Merge pull request #2115 from kohya-ss/fix-flux-sampling-accelerate-e…
kohya-ss Jun 15, 2025
51 changes: 51 additions & 0 deletions .github/workflows/tests.yml
@@ -0,0 +1,51 @@
name: Test with pytest

on:
  push:
    branches:
      - main
      - dev
      - sd3
  pull_request:
    branches:
      - main
      - dev
      - sd3

# CKV2_GHA_1: "Ensure top-level permissions are not set to write-all"
permissions: read-all

jobs:
  build:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-latest]
        python-version: ["3.10"] # Python versions to test
        pytorch-version: ["2.4.0"] # PyTorch versions to test

    steps:
      - uses: actions/checkout@v4
        with:
          # https://woodruffw.github.io/zizmor/audits/#artipacked
          persist-credentials: false

      - uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'

      - name: Install and update pip, setuptools, wheel
        run: |
          # Setuptools, wheel for compiling some packages
          python -m pip install --upgrade pip setuptools wheel

      - name: Install dependencies
        run: |
          # Pre-install torch to pin version (requirements.txt has dependencies like transformers which requires pytorch)
          pip install dadaptation==3.2 torch==${{ matrix.pytorch-version }} torchvision pytest==8.3.4
          pip install -r requirements.txt

      - name: Test with pytest
        run: pytest # See pytest.ini for configuration

14 changes: 11 additions & 3 deletions .github/workflows/typos.yml
@@ -1,21 +1,29 @@
---
# yamllint disable rule:line-length
name: Typos

on: # yamllint disable-line rule:truthy
on:
  push:
    branches:
      - main
      - dev
  pull_request:
    types:
      - opened
      - synchronize
      - reopened

# CKV2_GHA_1: "Ensure top-level permissions are not set to write-all"
permissions: read-all

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4
        with:
          # https://woodruffw.github.io/zizmor/audits/#artipacked
          persist-credentials: false

      - name: typos-action
        uses: crate-ci/typos@v1.24.3
        uses: crate-ci/typos@v1.28.1
787 changes: 783 additions & 4 deletions README.md

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions docs/config_README-en.md
@@ -152,6 +152,7 @@ These options are related to subset configuration.
| `keep_tokens_separator` | `“|||”` | o | o | o |
| `secondary_separator` | `“;;;”` | o | o | o |
| `enable_wildcard` | `true` | o | o | o |
| `resize_interpolation` | (not specified) | o | o | o |

* `num_repeats`
* Specifies the number of repeats for images in a subset. This is equivalent to `--dataset_repeats` in fine-tuning but can be specified for any training method.
@@ -165,6 +166,8 @@ These options are related to subset configuration.
* Specifies an additional separator. The part separated by this separator is treated as one tag and is shuffled and dropped. It is then replaced by `caption_separator`. For example, if you specify `aaa;;;bbb;;;ccc`, it will be replaced by `aaa,bbb,ccc` or dropped together.
* `enable_wildcard`
* Enables wildcard notation. This will be explained later.
* `resize_interpolation`
* Specifies the interpolation method used when resizing images. Normally, there is no need to specify this. The following options can be specified: `lanczos`, `nearest`, `bilinear`, `linear`, `bicubic`, `cubic`, `area`, `box`. By default (when not specified), `area` is used for downscaling, and `lanczos` is used for upscaling. If this option is specified, the same interpolation method will be used for both upscaling and downscaling. When `lanczos` or `box` is specified, PIL is used; for other options, OpenCV is used.
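
For illustration only (this sketch is not part of the documented file): a minimal dataset config showing where `resize_interpolation` could be set at the subset level. The path, resolution, and batch size are placeholder values; only `resize_interpolation` and its allowed values are taken from the description above.

```toml
[general]
resolution = 1024          # placeholder value
caption_extension = ".txt"

[[datasets]]
batch_size = 2             # placeholder value

  [[datasets.subsets]]
  image_dir = "/path/to/train_images"   # placeholder path
  num_repeats = 1
  # The same method is then used for both upscaling and downscaling;
  # lanczos and box go through PIL, the other values through OpenCV.
  resize_interpolation = "bicubic"
```

If `resize_interpolation` is left unset, the defaults described above apply (`area` for downscaling, `lanczos` for upscaling).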

### DreamBooth-specific options

4 changes: 4 additions & 0 deletions docs/config_README-ja.md
@@ -144,6 +144,7 @@ DreamBooth の手法と fine tuning の手法の両方とも利用可能な学
| `keep_tokens_separator` | `“|||”` | o | o | o |
| `secondary_separator` | `“;;;”` | o | o | o |
| `enable_wildcard` | `true` | o | o | o |
| `resize_interpolation` |(通常は設定しません) | o | o | o |

* `num_repeats`
* サブセットの画像の繰り返し回数を指定します。fine tuning における `--dataset_repeats` に相当しますが、`num_repeats` はどの学習方法でも指定可能です。
@@ -162,6 +163,9 @@ DreamBooth の手法と fine tuning の手法の両方とも利用可能な学
* `enable_wildcard`
* ワイルドカード記法および複数行キャプションを有効にします。ワイルドカード記法、複数行キャプションについては後述します。

* `resize_interpolation`
* 画像のリサイズ時に使用する補間方法を指定します。通常は指定しなくて構いません。`lanczos`, `nearest`, `bilinear`, `linear`, `bicubic`, `cubic`, `area`, `box` が指定可能です。デフォルト(未指定時)は、縮小時は `area`、拡大時は `lanczos` になります。このオプションを指定すると、拡大時・縮小時とも同じ補間方法が使用されます。`lanczos` または `box` を指定するとPILが、それ以外を指定するとOpenCVが使用されます。

### DreamBooth 方式専用のオプション

DreamBooth 方式のオプションは、サブセット向けオプションのみ存在します。
2 changes: 1 addition & 1 deletion docs/train_lllite_README.md
@@ -185,7 +185,7 @@ for img_file in img_files:

### Creating a dataset configuration file

You can use the command line arguments of `sdxl_train_control_net_lllite.py` to specify the conditioning image directory. However, if you want to use a `.toml` file, specify the conditioning image directory in `conditioning_data_dir`.
You can use the command line argument `--conditioning_data_dir` of `sdxl_train_control_net_lllite.py` to specify the conditioning image directory. However, if you want to use a `.toml` file, specify the conditioning image directory in `conditioning_data_dir`.

```toml
[general]
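# Editorial sketch, not part of the original example (which is truncated here):
# a subset entry where the conditioning image directory is given via
# conditioning_data_dir, as described in the paragraph above.
# All paths are placeholders.
[[datasets]]

  [[datasets.subsets]]
  image_dir = "/path/to/train_images"
  conditioning_data_dir = "/path/to/conditioning_images"
  caption_extension = ".txt"
```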
86 changes: 52 additions & 34 deletions fine_tune.py
@@ -10,7 +10,7 @@
from tqdm import tqdm

import torch
from library import deepspeed_utils
from library import deepspeed_utils, strategy_base
from library.device_utils import init_ipex, clean_memory_on_device

init_ipex()
@@ -39,6 +39,7 @@
scale_v_prediction_loss_like_noise_prediction,
apply_debiased_estimation,
)
import library.strategy_sd as strategy_sd


def train(args):
@@ -52,7 +53,15 @@ def train(args):
if args.seed is not None:
set_seed(args.seed) # 乱数系列を初期化する

tokenizer = train_util.load_tokenizer(args)
tokenize_strategy = strategy_sd.SdTokenizeStrategy(args.v2, args.max_token_length, args.tokenizer_cache_dir)
strategy_base.TokenizeStrategy.set_strategy(tokenize_strategy)

# prepare caching strategy: this must be set before preparing dataset. because dataset may use this strategy for initialization.
if cache_latents:
latents_caching_strategy = strategy_sd.SdSdxlLatentsCachingStrategy(
False, args.cache_latents_to_disk, args.vae_batch_size, args.skip_cache_check
)
strategy_base.LatentsCachingStrategy.set_strategy(latents_caching_strategy)

# データセットを準備する
if args.dataset_class is None:
@@ -81,10 +90,11 @@
]
}

blueprint = blueprint_generator.generate(user_config, args, tokenizer=tokenizer)
train_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group)
blueprint = blueprint_generator.generate(user_config, args)
train_dataset_group, val_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group)
else:
train_dataset_group = train_util.load_arbitrary_dataset(args, tokenizer)
train_dataset_group = train_util.load_arbitrary_dataset(args)
val_dataset_group = None

current_epoch = Value("i", 0)
current_step = Value("i", 0)
@@ -167,8 +177,9 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module):
vae.to(accelerator.device, dtype=vae_dtype)
vae.requires_grad_(False)
vae.eval()
with torch.no_grad():
train_dataset_group.cache_latents(vae, args.vae_batch_size, args.cache_latents_to_disk, accelerator.is_main_process)

train_dataset_group.new_cache_latents(vae, accelerator)

vae.to("cpu")
clean_memory_on_device(accelerator.device)

@@ -194,6 +205,9 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module):
else:
text_encoder.eval()

text_encoding_strategy = strategy_sd.SdTextEncodingStrategy(args.clip_skip)
strategy_base.TextEncodingStrategy.set_strategy(text_encoding_strategy)

if not cache_latents:
vae.requires_grad_(False)
vae.eval()
@@ -216,7 +230,11 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module):
accelerator.print("prepare optimizer, data loader etc.")
_, _, optimizer = train_util.get_optimizer(args, trainable_params=trainable_params)

# dataloaderを準備する
# prepare dataloader
# strategies are set here because they cannot be referenced in another process. Copy them with the dataset
# some strategies can be None
train_dataset_group.set_current_strategies()

# DataLoaderのプロセス数:0 は persistent_workers が使えないので注意
n_workers = min(args.max_data_loader_n_workers, os.cpu_count()) # cpu_count or max_data_loader_n_workers
train_dataloader = torch.utils.data.DataLoader(
@@ -319,7 +337,12 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module):
)

# For --sample_at_first
train_util.sample_images(accelerator, args, 0, global_step, accelerator.device, vae, tokenizer, text_encoder, unet)
train_util.sample_images(
accelerator, args, 0, global_step, accelerator.device, vae, tokenize_strategy.tokenizer, text_encoder, unet
)
if len(accelerator.trackers) > 0:
# log empty object to commit the sample images to wandb
accelerator.log({}, step=0)

loss_recorder = train_util.LossRecorder()
for epoch in range(num_train_epochs):
@@ -344,25 +367,21 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module):
with torch.set_grad_enabled(args.train_text_encoder):
# Get the text embedding for conditioning
if args.weighted_captions:
encoder_hidden_states = get_weighted_text_embeddings(
tokenizer,
text_encoder,
batch["captions"],
accelerator.device,
args.max_token_length // 75 if args.max_token_length else 1,
clip_skip=args.clip_skip,
)
input_ids_list, weights_list = tokenize_strategy.tokenize_with_weights(batch["captions"])
encoder_hidden_states = text_encoding_strategy.encode_tokens_with_weights(
tokenize_strategy, [text_encoder], input_ids_list, weights_list
)[0]
else:
input_ids = batch["input_ids"].to(accelerator.device)
encoder_hidden_states = train_util.get_hidden_states(
args, input_ids, tokenizer, text_encoder, None if not args.full_fp16 else weight_dtype
)
input_ids = batch["input_ids_list"][0].to(accelerator.device)
encoder_hidden_states = text_encoding_strategy.encode_tokens(
tokenize_strategy, [text_encoder], [input_ids]
)[0]
if args.full_fp16:
encoder_hidden_states = encoder_hidden_states.to(weight_dtype)

# Sample noise, sample a random timestep for each image, and add noise to the latents,
# with noise offset and/or multires noise if specified
noise, noisy_latents, timesteps, huber_c = train_util.get_noise_noisy_latents_and_timesteps(
args, noise_scheduler, latents
)
noise, noisy_latents, timesteps = train_util.get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents)

# Predict the noise residual
with accelerator.autocast():
@@ -374,11 +393,10 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module):
else:
target = noise

huber_c = train_util.get_huber_threshold_if_needed(args, timesteps, noise_scheduler)
if args.min_snr_gamma or args.scale_v_pred_loss_like_noise_pred or args.debiased_estimation_loss:
# do not mean over batch dimension for snr weight or scale v-pred loss
loss = train_util.conditional_loss(
noise_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=huber_c
)
loss = train_util.conditional_loss(noise_pred.float(), target.float(), args.loss_type, "none", huber_c)
loss = loss.mean([1, 2, 3])

if args.min_snr_gamma:
@@ -390,9 +408,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module):

loss = loss.mean() # mean over batch dimension
else:
loss = train_util.conditional_loss(
noise_pred.float(), target.float(), reduction="mean", loss_type=args.loss_type, huber_c=huber_c
)
loss = train_util.conditional_loss(noise_pred.float(), target.float(), args.loss_type, "mean", huber_c)

accelerator.backward(loss)
if accelerator.sync_gradients and args.max_grad_norm != 0.0:
@@ -411,7 +427,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module):
global_step += 1

train_util.sample_images(
accelerator, args, None, global_step, accelerator.device, vae, tokenizer, text_encoder, unet
accelerator, args, None, global_step, accelerator.device, vae, tokenize_strategy.tokenizer, text_encoder, unet
)

# 指定ステップごとにモデルを保存
@@ -436,7 +452,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module):
)

current_loss = loss.detach().item() # 平均なのでbatch sizeは関係ないはず
if args.logging_dir is not None:
if len(accelerator.trackers) > 0:
logs = {"loss": current_loss}
train_util.append_lr_to_logs(logs, lr_scheduler, args.optimizer_type, including_unet=True)
accelerator.log(logs, step=global_step)
@@ -449,7 +465,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module):
if global_step >= args.max_train_steps:
break

if args.logging_dir is not None:
if len(accelerator.trackers) > 0:
logs = {"loss/epoch": loss_recorder.moving_average}
accelerator.log(logs, step=epoch + 1)

Expand All @@ -474,7 +490,9 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module):
vae,
)

train_util.sample_images(accelerator, args, epoch + 1, global_step, accelerator.device, vae, tokenizer, text_encoder, unet)
train_util.sample_images(
accelerator, args, epoch + 1, global_step, accelerator.device, vae, tokenize_strategy.tokenizer, text_encoder, unet
)

is_main_process = accelerator.is_main_process
if is_main_process:
Expand Down
7 changes: 2 additions & 5 deletions finetune/tag_images_by_wd14_tagger.py
@@ -11,7 +11,7 @@
from tqdm import tqdm

import library.train_util as train_util
from library.utils import setup_logging, pil_resize
from library.utils import setup_logging, resize_image

setup_logging()
import logging
Expand Down Expand Up @@ -42,10 +42,7 @@ def preprocess_image(image):
pad_t = pad_y // 2
image = np.pad(image, ((pad_t, pad_y - pad_t), (pad_l, pad_x - pad_l), (0, 0)), mode="constant", constant_values=255)

if size > IMAGE_SIZE:
image = cv2.resize(image, (IMAGE_SIZE, IMAGE_SIZE), cv2.INTER_AREA)
else:
image = pil_resize(image, (IMAGE_SIZE, IMAGE_SIZE))
image = resize_image(image, image.shape[0], image.shape[1], IMAGE_SIZE, IMAGE_SIZE)

image = image.astype(np.float32)
return image