Merge pull request #28 from raoulritter/main

arda-argmax · web-flow · commit bfbdd0eff6f9 · 2024-09-09T16:50:31.000-07:00
[Model Support] FLUX.1-dev
diff --git a/.flake8 b/.flake8
diff --git a/.gitignore b/.gitignore
@@ -27,6 +27,7 @@ __pycache__/
 # Distribution / packaging
 .Python
 build/
+.build/
 develop-eggs/
 dist/
 downloads/
diff --git a/README.md b/README.md
@@ -32,7 +32,11 @@ pip install -e .
   <summary> Click to expand </summary>
 
 
-[Stable Diffusion 3](https://huggingface.co/stabilityai/stable-diffusion-3-medium) requires users to accept the terms before downloading the checkpoint. Once you accept the terms, sign in with your Hugging Face hub READ token as below:
+[Stable Diffusion 3](https://huggingface.co/stabilityai/stable-diffusion-3-medium) requires users to accept the terms before downloading the checkpoint.
+
+[FLUX.1-dev](https://huggingface.co/black-forest-labs/FLUX.1-dev) also requires users to accept the terms before downloading the checkpoint.
+
+Once you accept the terms, sign in with your Hugging Face hub READ token as below:
 > [!IMPORTANT]
 > If using a fine-grained token, it is also necessary to [edit permissions](https://huggingface.co/settings/tokens) to allow `Read access to contents of all public gated repos you can access`
 
@@ -89,6 +93,8 @@ Some notable optional arguments for:
 
 Please refer to the help menu for all available arguments: `diffusionkit-cli -h`.
 
+Note: When using `FLUX.1-dev`, verify that you have accepted the [FLUX.1-dev license](https://huggingface.co/black-forest-labs/FLUX.1-dev) and that your [Hugging Face token](https://huggingface.co/settings/tokens) allows access to gated repos.
+
 ### Code ###
 
 For Stable Diffusion 3:
@@ -109,7 +115,7 @@ For FLUX:
 from diffusionkit.mlx import FluxPipeline
 pipeline = FluxPipeline(
   shift=1.0,
-  model_version="argmaxinc/mlx-FLUX.1-schnell",
+  model_version="argmaxinc/mlx-FLUX.1-schnell", # model_version="argmaxinc/mlx-FLUX.1-dev" for FLUX.1-dev
   low_memory_mode=True,
   a16=True,
   w16=True,
@@ -120,7 +126,7 @@ Finally, to generate the image, use the `generate_image()` function:
 ```python
 HEIGHT = 512
 WIDTH = 512
-NUM_STEPS = 4  #  4 for FLUX.1-schnell, 50 for SD3
+NUM_STEPS = 4  #  4 for FLUX.1-schnell, 50 for SD3 and FLUX.1-dev
 CFG_WEIGHT = 0. # for FLUX.1-schnell, 5. for SD3
 
 image, _ = pipeline.generate_image(
diff --git a/python/src/diffusionkit/mlx/__init__.py b/python/src/diffusionkit/mlx/__init__.py
@@ -39,12 +39,14 @@
     "sd3-8b-unreleased": "models/sd3_8b_beta.safetensors",  # unreleased
     "argmaxinc/mlx-FLUX.1-schnell": "argmaxinc/mlx-FLUX.1-schnell",
     "argmaxinc/mlx-FLUX.1-schnell-4bit-quantized": "argmaxinc/mlx-FLUX.1-schnell-4bit-quantized",
+    "argmaxinc/mlx-FLUX.1-dev": "argmaxinc/mlx-FLUX.1-dev",
 }
 
 T5_MAX_LENGTH = {
     "argmaxinc/mlx-stable-diffusion-3-medium": 512,
     "argmaxinc/mlx-FLUX.1-schnell": 256,
     "argmaxinc/mlx-FLUX.1-schnell-4bit-quantized": 256,
+    "argmaxinc/mlx-FLUX.1-dev": 512,
 }
 
 
@@ -653,7 +655,9 @@ def encode_text(
             text,
             (negative_text if cfg_weight > 1 else None),
         )
-        padded_tokens_t5 = mx.zeros((1, 256)).astype(tokens_t5.dtype)
+        padded_tokens_t5 = mx.zeros((1, T5_MAX_LENGTH[self.model_version])).astype(
+            tokens_t5.dtype
+        )
         padded_tokens_t5[:, : tokens_t5.shape[1]] = tokens_t5[
             [0], :
         ]  # Ignore negative text
diff --git a/python/src/diffusionkit/mlx/config.py b/python/src/diffusionkit/mlx/config.py
@@ -68,6 +68,8 @@ def hidden_size(self) -> int:
 
     low_memory_mode: bool = True
 
+    guidance_embed: bool = False
+
 
 SD3_8b = MMDiTConfig(depth_multimodal=38, num_heads=3, upcast_multimodal_blocks=[35])
 
@@ -90,6 +92,22 @@ def hidden_size(self) -> int:
     dtype=mx.bfloat16,
 )
 
+FLUX_DEV = MMDiTConfig(  # MMDiT settings introduced with FLUX.1-dev model support
+    num_heads=24,
+    depth_multimodal=19,
+    depth_unified=38,
+    parallel_mlp_for_unified_blocks=True,
+    hidden_size_override=3072,
+    patchify_via_reshape=True,
+    pos_embed_type=PositionalEncoding.PreSDPARope,
+    rope_axes_dim=(16, 56, 56),
+    pooled_text_embed_dim=768,  # CLIP-L/14 only
+    use_qk_norm=True,
+    float16_dtype=mx.bfloat16,
+    guidance_embed=True,  # MMDiT builds an MLPEmbedder as `guidance_in` when this is set
+    dtype=mx.bfloat16,
+)
+
 
 @dataclass
 class AutoencoderConfig:
diff --git a/python/src/diffusionkit/mlx/mmdit.py b/python/src/diffusionkit/mlx/mmdit.py
@@ -28,6 +28,13 @@ def __init__(self, config: MMDiTConfig):
         super().__init__()
         self.config = config
 
+        if config.guidance_embed:
+            self.guidance_in = MLPEmbedder(
+                in_dim=config.frequency_embed_dim, hidden_dim=config.hidden_size
+            )
+        else:
+            self.guidance_in = nn.Identity()
+
         # Input adapters and embeddings
         self.x_embedder = LatentImageAdapter(config)
 
@@ -209,6 +216,9 @@ def __call__(
         else:
             positional_encodings = None
 
+        if self.config.guidance_embed:
+            timestep = self.guidance_in(self.t_embedder(timestep))
+
         # MultiModalTransformer layers
         if self.config.depth_multimodal > 0:
             for bidx, block in enumerate(self.multimodal_transformer_blocks):
@@ -236,7 +246,6 @@ def __call__(
                 :, token_level_text_embeddings.shape[1] :, ...
             ]
 
-        # Final layer
         latent_image_embeddings = self.final_layer(
             latent_image_embeddings,
             timestep,
@@ -933,6 +942,19 @@ def apply(q_or_k: mx.array, rope: mx.array) -> mx.array:
         )
 
 
+class MLPEmbedder(nn.Module):  # Embedding MLP: Linear -> SiLU -> Linear
+    def __init__(self, in_dim: int, hidden_dim: int):  # in_dim: input width; hidden_dim: output width of both Linear layers
+        super().__init__()
+        self.mlp = nn.Sequential(
+            nn.Linear(in_dim, hidden_dim),  # project in_dim -> hidden_dim
+            nn.SiLU(),
+            nn.Linear(hidden_dim, hidden_dim),  # width stays at hidden_dim
+        )
+
+    def __call__(self, x):  # run x through the MLP and return the result
+        return self.mlp(x)
+
+
 def affine_transform(
     x: mx.array,
     shift: mx.array,
diff --git a/python/src/diffusionkit/mlx/model_io.py b/python/src/diffusionkit/mlx/model_io.py
@@ -46,6 +46,10 @@
         "argmaxinc/mlx-FLUX.1-schnell-4bit-quantized": "flux-schnell-4bit-quantized.safetensors",
         "vae": "ae.safetensors",
     },
+    "argmaxinc/mlx-FLUX.1-dev": {
+        "argmaxinc/mlx-FLUX.1-dev": "flux1-dev.safetensors",
+        "vae": "ae.safetensors",
+    },
 }
 _DEFAULT_MODEL = "argmaxinc/stable-diffusion"
 _MODELS = {
@@ -75,6 +79,10 @@
         "vae_encoder": "encoder.",
         "vae_decoder": "decoder.",
     },
+    "argmaxinc/mlx-FLUX.1-dev": {
+        "vae_encoder": "encoder.",
+        "vae_decoder": "decoder.",
+    },
 }
 
 _FLOAT16 = mx.bfloat16
@@ -704,7 +712,7 @@ def load_flux(
     hf_hub_download(key, "config.json")
     weights = mx.load(flux_weights_ckpt)
 
-    if model_key == "argmaxinc/mlx-FLUX.1-schnell":
+    if model_key in ["argmaxinc/mlx-FLUX.1-schnell", "argmaxinc/mlx-FLUX.1-dev"]:
         weights = flux_state_dict_adjustments(
             weights,
             prefix="",
diff --git a/python/src/diffusionkit/mlx/scripts/generate_images.py b/python/src/diffusionkit/mlx/scripts/generate_images.py
@@ -17,18 +17,21 @@
     "sd3-8b-unreleased": 1024,
     "argmaxinc/mlx-FLUX.1-schnell": 512,
     "argmaxinc/mlx-FLUX.1-schnell-4bit-quantized": 512,
+    "argmaxinc/mlx-FLUX.1-dev": 512,
 }
 WIDTH = {
     "argmaxinc/mlx-stable-diffusion-3-medium": 512,
     "sd3-8b-unreleased": 1024,
     "argmaxinc/mlx-FLUX.1-schnell": 512,
     "argmaxinc/mlx-FLUX.1-schnell-4bit-quantized": 512,
+    "argmaxinc/mlx-FLUX.1-dev": 512,
 }
 SHIFT = {
     "argmaxinc/mlx-stable-diffusion-3-medium": 3.0,
     "sd3-8b-unreleased": 3.0,
     "argmaxinc/mlx-FLUX.1-schnell": 1.0,
     "argmaxinc/mlx-FLUX.1-schnell-4bit-quantized": 1.0,
+    "argmaxinc/mlx-FLUX.1-dev": 1.0,
 }
 
 
@@ -111,7 +114,7 @@ def cli():
     args.a16 = True
 
     if "FLUX" in args.model_version and args.cfg > 0.0:
-        logger.warning("Disabling CFG for FLUX.1-schnell model.")
+        logger.warning(f"Disabling CFG for {args.model_version} model.")
         args.cfg = 0.0
 
     if args.benchmark_mode:
diff --git a/setup.py b/setup.py
@@ -3,7 +3,7 @@
 from setuptools import find_packages, setup
 from setuptools.command.install import install
 
-VERSION = "0.3.5"
+VERSION = "0.4.0"
 
 
 class VersionInstallCommand(install):
@@ -29,7 +29,7 @@ def run(self):
         "argmaxtools>=0.1.13",
         "torch",
         "safetensors",
-        "mlx>=0.16.3",
+        "mlx>=0.17.1",
         "jaxtyping",
         "transformers",
         "pillow",