
Commit 1734dda

Select dev/schnell based on state dict, use correct max seq len based on dev/schnell, and shift in inference, separate vae flux params into separate config
1 parent 5143ff2 · commit 1734dda

9 files changed: +366 −196 lines

invokeai/app/invocations/flux_text_encoder.py

Lines changed: 6 additions & 10 deletions
@@ -1,4 +1,5 @@
 import torch
+from typing import Literal
 from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5Tokenizer
 
 from invokeai.app.invocations.baseinvocation import BaseInvocation, invocation
@@ -23,11 +24,12 @@ class FluxTextEncoderInvocation(BaseInvocation):
         description=FieldDescriptions.clip,
         input=Input.Connection,
     )
-    t5Encoder: T5EncoderField = InputField(
+    t5_encoder: T5EncoderField = InputField(
         title="T5Encoder",
         description=FieldDescriptions.t5Encoder,
         input=Input.Connection,
     )
+    max_seq_len: Literal[256, 512] = InputField(description="Max sequence length for the desired flux model")
     positive_prompt: str = InputField(description="Positive prompt for text-to-image generation.")
 
     # TODO(ryand): Should we create a new return type for this invocation? This ConditioningOutput is clearly not
@@ -43,21 +45,15 @@ def invoke(self, context: InvocationContext) -> ConditioningOutput:
         return ConditioningOutput.build(conditioning_name)
 
     def _encode_prompt(self, context: InvocationContext) -> tuple[torch.Tensor, torch.Tensor]:
-        # TODO: Determine the T5 max sequence length based on the model.
-        # if self.model == "flux-schnell":
-        max_seq_len = 256
-        # # elif self.model == "flux-dev":
-        # # max_seq_len = 512
-        # else:
-        #     raise ValueError(f"Unknown model: {self.model}")
+        max_seq_len = self.max_seq_len
 
         # Load CLIP.
         clip_tokenizer_info = context.models.load(self.clip.tokenizer)
         clip_text_encoder_info = context.models.load(self.clip.text_encoder)
 
         # Load T5.
-        t5_tokenizer_info = context.models.load(self.t5Encoder.tokenizer)
-        t5_text_encoder_info = context.models.load(self.t5Encoder.text_encoder)
+        t5_tokenizer_info = context.models.load(self.t5_encoder.tokenizer)
+        t5_text_encoder_info = context.models.load(self.t5_encoder.text_encoder)
 
         with (
             clip_text_encoder_info as clip_text_encoder,
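The new max_seq_len input replaces the hard-coded 256, so a graph wired to a dev transformer can request the full 512-token T5 context. A minimal sketch of where the value ends up, using the plain transformers tokenizer API for illustration rather than the invocation's actual encoding path (which goes through the loaded tokenizer/encoder infos above):

from transformers import T5Tokenizer

# Sketch: pad/truncate the prompt to max_seq_len tokens before it reaches
# the T5 text encoder. max_seq_len is Literal[256, 512]: 256 for schnell,
# 512 for dev.
tokenizer = T5Tokenizer.from_pretrained("google/t5-v1_1-xxl")
tokens = tokenizer(
    "a photo of a forest clearing at dawn",
    truncation=True,
    padding="max_length",
    max_length=256,
    return_tensors="pt",
)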

invokeai/app/invocations/flux_text_to_image.py

Lines changed: 2 additions & 1 deletion
@@ -19,6 +19,7 @@
 from invokeai.backend.flux.sampling import denoise, get_noise, get_schedule, unpack
 from invokeai.backend.stable_diffusion.diffusion.conditioning_data import FLUXConditioningInfo
 from invokeai.backend.util.devices import TorchDevice
+from invokeai.backend.model_manager.config import CheckpointConfigBase
 
 
 @invocation(
@@ -89,7 +90,7 @@ def _run_diffusion(
         img, img_ids = self._prepare_latent_img_patches(x)
 
         # HACK(ryand): Find a better way to determine if this is a schnell model or not.
-        is_schnell = "schnell" in transformer_info.config.path if transformer_info.config else ""
+        is_schnell = "schnell" in transformer_info.config.config_path if transformer_info.config and isinstance(transformer_info.config, CheckpointConfigBase) else ""
         timesteps = get_schedule(
             num_steps=self.num_steps,
             image_seq_len=img.shape[1],
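The commit message also mentions "shift in inference": presumably is_schnell feeds the shift flag of get_schedule, which toggles the resolution-dependent timestep shift that dev applies and schnell skips. A sketch of how the truncated call above would continue, assuming the reference get_schedule(num_steps, image_seq_len, ..., shift=...) signature:

timesteps = get_schedule(
    num_steps=self.num_steps,
    image_seq_len=img.shape[1],
    shift=not is_schnell,  # assumption: dev shifts the schedule, schnell does not
)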

invokeai/app/invocations/model.py

Lines changed: 15 additions & 2 deletions
@@ -1,4 +1,5 @@
 import copy
+import yaml
 from time import sleep
 from typing import Dict, List, Literal, Optional
 
@@ -16,6 +17,7 @@
 from invokeai.app.services.shared.invocation_context import InvocationContext
 from invokeai.app.shared.models import FreeUConfig
 from invokeai.backend.model_manager.config import AnyModelConfig, BaseModelType, ModelFormat, ModelType, SubModelType
+from invokeai.backend.model_manager.config import CheckpointConfigBase
 
 
 class ModelIdentifierField(BaseModel):
@@ -154,8 +156,9 @@ class FluxModelLoaderOutput(BaseInvocationOutput):
 
     transformer: TransformerField = OutputField(description=FieldDescriptions.transformer, title="Transformer")
     clip: CLIPField = OutputField(description=FieldDescriptions.clip, title="CLIP")
-    t5Encoder: T5EncoderField = OutputField(description=FieldDescriptions.t5Encoder, title="T5 Encoder")
+    t5_encoder: T5EncoderField = OutputField(description=FieldDescriptions.t5Encoder, title="T5 Encoder")
     vae: VAEField = OutputField(description=FieldDescriptions.vae, title="VAE")
+    max_seq_len: Literal[256, 512] = OutputField(description=FieldDescriptions.vae, title="Max Seq Length")
 
 
 @invocation("flux_model_loader", title="Flux Main Model", tags=["model", "flux"], category="model", version="1.0.3")
@@ -189,12 +192,22 @@ def invoke(self, context: InvocationContext) -> FluxModelLoaderOutput:
             ModelType.VAE,
             BaseModelType.Flux,
         )
+        transformer_config = context.models.get_config(transformer)
+        assert isinstance(transformer_config, CheckpointConfigBase)
+        legacy_config_path = context.config.get().legacy_conf_path / transformer_config.config_path
+        config_path = legacy_config_path.as_posix()
+        with open(config_path, "r") as stream:
+            try:
+                flux_conf = yaml.safe_load(stream)
+            except:
+                raise
 
         return FluxModelLoaderOutput(
             transformer=TransformerField(transformer=transformer),
             clip=CLIPField(tokenizer=tokenizer, text_encoder=clip_encoder, loras=[], skipped_layers=0),
-            t5Encoder=T5EncoderField(tokenizer=tokenizer2, text_encoder=t5_encoder),
+            t5_encoder=T5EncoderField(tokenizer=tokenizer2, text_encoder=t5_encoder),
             vae=VAEField(vae=vae),
+            max_seq_len=flux_conf['max_seq_len']
         )
 
     def _get_model(self, context: InvocationContext, submodel: SubModelType) -> ModelIdentifierField:
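The loader now resolves the transformer's legacy YAML config and forwards its max_seq_len through the new output field, so the text encoder node no longer has to guess the model variant. A standalone sketch of the lookup, with a hypothetical on-disk path standing in for context.config.get().legacy_conf_path / transformer_config.config_path:

import yaml
from pathlib import Path

# Hypothetical path for illustration; the invocation joins legacy_conf_path
# with the probed config_path (flux/flux1-dev.yaml or flux/flux1-schnell.yaml).
config_path = Path("invokeai/configs") / "flux/flux1-dev.yaml"
with open(config_path, "r") as stream:
    flux_conf = yaml.safe_load(stream)

max_seq_len = flux_conf["max_seq_len"]  # 512 for dev, 256 for schnell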

invokeai/backend/model_manager/load/model_loaders/flux.py

Lines changed: 1 addition & 2 deletions
@@ -32,7 +32,6 @@
 )
 from invokeai.backend.model_manager.load.model_loader_registry import ModelLoaderRegistry
 from invokeai.backend.model_manager.load.model_loaders.generic_diffusers import GenericDiffusersLoader
-from invokeai.backend.util.devices import TorchDevice
 from invokeai.backend.util.silence_warnings import SilenceWarnings
 from invokeai.backend.quantization.bnb_nf4 import quantize_model_nf4
 
@@ -60,7 +59,7 @@ def _load_model(
             raise
 
         dataclass_fields = {f.name for f in fields(AutoEncoderParams)}
-        filtered_data = {k: v for k, v in flux_conf["params"]["ae_params"].items() if k in dataclass_fields}
+        filtered_data = {k: v for k, v in flux_conf["params"].items() if k in dataclass_fields}
         params = AutoEncoderParams(**filtered_data)
 
         with SilenceWarnings():
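With ae_params gone from the transformer configs, the VAE loader reads the flat params block of the new flux1-vae.yaml directly. The filter-by-declared-fields idiom keeps stray YAML keys from breaking the **kwargs constructor; a self-contained sketch with an abridged stand-in for AutoEncoderParams:

from dataclasses import dataclass, fields

import yaml

@dataclass
class AutoEncoderParams:  # abridged stand-in; the real dataclass has more fields
    resolution: int
    in_channels: int
    z_channels: int
    scale_factor: float
    shift_factor: float

yaml_text = """
params:
  resolution: 256
  in_channels: 3
  z_channels: 16
  scale_factor: 0.3611
  shift_factor: 0.1159
  extra_key: ignored
"""
flux_conf = yaml.safe_load(yaml_text)

# Keep only keys the dataclass declares; extra_key is silently dropped.
dataclass_fields = {f.name for f in fields(AutoEncoderParams)}
filtered_data = {k: v for k, v in flux_conf["params"].items() if k in dataclass_fields}
params = AutoEncoderParams(**filtered_data)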

invokeai/backend/model_manager/probe.py

Lines changed: 7 additions & 2 deletions
@@ -324,7 +324,12 @@ def _get_checkpoint_config_path(
         if model_type is ModelType.Main:
             if base_type == BaseModelType.Flux:
                 # TODO: Decide between dev/schnell
-                config_file = "flux/flux1-schnell.yaml"
+                checkpoint = ModelProbe._scan_and_load_checkpoint(model_path)
+                state_dict = checkpoint.get("state_dict") or checkpoint
+                if 'guidance_in.out_layer.weight' in state_dict:
+                    config_file = "flux/flux1-dev.yaml"
+                else:
+                    config_file = "flux/flux1-schnell.yaml"
             else:
                 config_file = LEGACY_CONFIGS[base_type][variant_type]
                 if isinstance(config_file, dict):  # need another tier for sd-2.x models
@@ -338,7 +343,7 @@ def _get_checkpoint_config_path(
             )
         elif model_type is ModelType.VAE:
             config_file = (
-                "flux/flux1-schnell.yaml"
+                "flux/flux1-vae.yaml"
                 if base_type is BaseModelType.Flux
                 else "stable-diffusion/v1-inference.yaml"
                 if base_type is BaseModelType.StableDiffusion1
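The probe now picks dev vs. schnell from the weights themselves: dev is trained with guidance_embed: True (see flux1-dev.yaml below), so its state dict carries guidance_in.* tensors, while schnell's does not. A minimal sketch of the same check against a bare safetensors file, independent of ModelProbe:

from safetensors import safe_open

def guess_flux_variant(checkpoint_path: str) -> str:
    # dev embeds a guidance MLP, so guidance_in.* tensors are present in its
    # state dict; schnell (guidance_embed: False) has no such keys.
    with safe_open(checkpoint_path, framework="pt") as f:
        keys = set(f.keys())
    return "dev" if "guidance_in.out_layer.weight" in keys else "schnell"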

invokeai/configs/flux/flux1-dev.yaml

Lines changed: 1 addition & 15 deletions
@@ -1,6 +1,6 @@
 repo_id: "black-forest-labs/FLUX.1-dev"
 repo_ae: "ae.safetensors"
-max_length: 512
+max_seq_len: 512
 params:
   in_channels: 64
   vec_in_dim: 768
@@ -17,17 +17,3 @@ params:
   theta: 10_000
   qkv_bias: True
   guidance_embed: True
-  ae_params:
-    resolution: 256
-    in_channels: 3
-    ch: 128
-    out_ch: 3
-    ch_mult:
-      - 1
-      - 2
-      - 4
-      - 4
-    num_res_blocks: 2
-    z_channels: 16
-    scale_factor: 0.3611
-    shift_factor: 0.1159
invokeai/configs/flux/flux1-schnell.yaml

Lines changed: 1 addition & 16 deletions
@@ -1,7 +1,6 @@
 repo_id: "black-forest-labs/FLUX.1-schnell"
 repo_ae: "ae.safetensors"
-t5_encoder: "google/t5-v1_1-xxl"
-max_length: 512
+max_seq_len: 256
 params:
   in_channels: 64
   vec_in_dim: 768
@@ -18,17 +17,3 @@ params:
   theta: 10_000
   qkv_bias: True
   guidance_embed: False
-  ae_params:
-    resolution: 256
-    in_channels: 3
-    ch: 128
-    out_ch: 3
-    ch_mult:
-      - 1
-      - 2
-      - 4
-      - 4
-    num_res_blocks: 2
-    z_channels: 16
-    scale_factor: 0.3611
-    shift_factor: 0.1159
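The differing max_seq_len values mirror the reference implementation: dev conditions on up to 512 T5 tokens, while the distilled schnell uses 256. The t5_encoder key is dropped here, presumably because the T5 encoder is now installed and resolved as its own model rather than through this config.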

invokeai/configs/flux/flux1-vae.yaml

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
+repo_id: "black-forest-labs/FLUX.1-schnell"
+repo_path: "ae.safetensors"
+params:
+  resolution: 256
+  in_channels: 3
+  ch: 128
+  out_ch: 3
+  ch_mult:
+    - 1
+    - 2
+    - 4
+    - 4
+  num_res_blocks: 2
+  z_channels: 16
+  scale_factor: 0.3611
+  shift_factor: 0.1159
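Factoring the autoencoder settings out works because the two ae_params blocks removed from the dev and schnell configs above were identical: a single flux/flux1-vae.yaml (pointing at the schnell repo's ae.safetensors) can serve both variants.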
