# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, List, Tuple, Union

import numpy as np
import PIL.Image
import torch

from ...configuration_utils import FrozenDict
from ...models import AutoencoderKL
from ...utils import logging
from ...video_processor import VaeImageProcessor
from ..modular_pipeline import PipelineBlock, PipelineState
from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam


logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


# Copied from diffusers.pipelines.flux.pipeline_flux._unpack_latents
def _unpack_latents(latents, height, width, vae_scale_factor):
    batch_size, num_patches, channels = latents.shape

    # The VAE applies 8x compression on images, but we must also account for packing,
    # which requires the latent height and width to be divisible by 2.
    height = 2 * (int(height) // (vae_scale_factor * 2))
    width = 2 * (int(width) // (vae_scale_factor * 2))
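    # For example, with the default height of 1024 and a vae_scale_factor of 8
    # (the Flux VAE's spatial compression), this gives 2 * (1024 // 16) = 128
    # latent pixels per side.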

    # Split the channel dim into (channels // 4, 2, 2) and interleave the 2x2
    # patch pixels back into the spatial dims.
    latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
    latents = latents.permute(0, 3, 1, 4, 2, 5)

    latents = latents.reshape(batch_size, channels // (2 * 2), height, width)

    return latents
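
# A minimal shape sketch (illustrative values, not used by the pipeline itself):
# packed Flux latents for a 1024x1024 image with vae_scale_factor=8 have shape
# (batch, 4096, 64) -- 64 * 64 two-by-two patches, each flattened into
# 16 channels * 2 * 2 = 64 values -- and unpack to a (batch, 16, 128, 128)
# latent image:
#
#     latents = torch.randn(1, 64 * 64, 64)
#     unpacked = _unpack_latents(latents, height=1024, width=1024, vae_scale_factor=8)
#     assert unpacked.shape == (1, 16, 128, 128)
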
class FluxDecodeStep(PipelineBlock):
    model_name = "flux"

    @property
    def expected_components(self) -> List[ComponentSpec]:
        return [
            ComponentSpec("vae", AutoencoderKL),
            ComponentSpec(
                "image_processor",
                VaeImageProcessor,
                config=FrozenDict({"vae_scale_factor": 16}),
                default_creation_method="from_config",
            ),
        ]
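
    # Note: the image processor's vae_scale_factor of 16 is 2 * the VAE's 8x
    # spatial compression, so resizing images to multiples of 16 keeps the
    # latent grid divisible by the 2x2 packing. default_creation_method="from_config"
    # builds the processor from this config rather than loading it from a
    # checkpoint.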

    @property
    def description(self) -> str:
        return "Step that decodes the denoised latents into images"

    @property
    def inputs(self) -> List[InputParam]:
        return [
            InputParam("output_type", default="pil"),
            InputParam("height", default=1024),
            InputParam("width", default=1024),
        ]

    @property
    def intermediate_inputs(self) -> List[InputParam]:
        return [
            InputParam(
                "latents",
                required=True,
                type_hint=torch.Tensor,
                description="The denoised latents from the denoising step",
            )
        ]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        return [
            OutputParam(
                "images",
                type_hint=Union[List[PIL.Image.Image], torch.Tensor, np.ndarray],
                description="The generated images: a list of PIL.Image.Image, a torch.Tensor, or a NumPy array",
            )
        ]

    @torch.no_grad()
    def __call__(self, components, state: PipelineState) -> Tuple[Any, PipelineState]:
        block_state = self.get_block_state(state)
        vae = components.vae

        if block_state.output_type != "latent":
            latents = block_state.latents
            # Unpack the token sequence back into a (batch, channels, height, width)
            # latent image before decoding.
            latents = _unpack_latents(latents, block_state.height, block_state.width, components.vae_scale_factor)
            # Undo the scaling and shifting that were applied when the latents
            # were first encoded.
            latents = (latents / vae.config.scaling_factor) + vae.config.shift_factor
            block_state.images = vae.decode(latents, return_dict=False)[0]
            block_state.images = components.image_processor.postprocess(
                block_state.images, output_type=block_state.output_type
            )
        else:
            # When latents are requested, skip decoding and pass them through as-is.
            block_state.images = block_state.latents

        self.set_block_state(state, block_state)

        return components, state
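
# A minimal usage sketch (hypothetical wiring; in practice the modular pipeline
# runner assembles `components` and `state`). `components` is assumed to expose
# `vae`, `image_processor`, and `vae_scale_factor`, and `state` to already hold
# the packed `latents` intermediate plus the `output_type`, `height`, and
# `width` inputs set by earlier blocks:
#
#     decode_step = FluxDecodeStep()
#     components, state = decode_step(components, state)
#     # the decoded images are now available on the state as the `images` output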