
Commit 714ae66: support fp8 store bf16 exec (#120)

* support fp8 store bf16 exec
* fix

Parent: 9ec5377

File tree: 11 files changed (+137, -56 lines)


diffsynth_engine/models/basic/lora.py
Lines changed: 39 additions & 19 deletions
@@ -37,14 +37,23 @@ def apply_to(self, w: Union[nn.Linear, nn.Conv2d, nn.Parameter, torch.Tensor]):
         else:
             delta_w = self.scale * (self.alpha / self.rank) * (self.up.weight @ self.down.weight)
         if isinstance(w, (nn.Linear, nn.Conv2d)):
-            delta_w = delta_w.to(device=w.weight.data.device, dtype=w.weight.data.dtype)
+            delta_w = delta_w.to(device=w.weight.data.device, dtype=self.dtype)
+            w_dtype = w.weight.data.dtype
+            w.weight.data = w.weight.data.to(self.dtype)
             w.weight.data.add_(delta_w)
+            w.weight.data = w.weight.data.to(w_dtype)
         elif isinstance(w, nn.Parameter):
-            delta_w = delta_w.to(device=w.data.device, dtype=w.data.dtype)
+            delta_w = delta_w.to(device=w.data.device, dtype=self.dtype)
+            w_dtype = w.data.dtype
+            w.data = w.data.to(self.dtype)
             w.data.add_(delta_w)
+            w.data = w.data.to(w_dtype)
         elif isinstance(w, torch.Tensor):
-            delta_w = delta_w.to(device=w.device, dtype=w.dtype)
+            delta_w = delta_w.to(device=w.device, dtype=self.dtype)
+            w_dtype = w.dtype
+            w = w.to(self.dtype)
             w.add_(delta_w)
+            w = w.to(w_dtype)


 class LoRALinear(nn.Linear):
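This is the upcast-merge-downcast pattern the commit title refers to: current PyTorch has no arithmetic kernels for float8_e4m3fn, so the weight is temporarily cast to the LoRA's compute dtype, updated in place, and cast back to its storage dtype. A minimal standalone sketch of the same round trip (toy shapes; bf16 assumed as the compute dtype):

import torch

# weight stored in fp8, LoRA delta computed in bf16
weight = torch.randn(16, 16, dtype=torch.bfloat16).to(torch.float8_e4m3fn)
delta_w = 0.5 * torch.randn(16, 16, dtype=torch.bfloat16)

w_dtype = weight.dtype               # remember the storage dtype (fp8)
weight = weight.to(torch.bfloat16)   # upcast: fp8 has no add kernels
weight.add_(delta_w)                 # merge the LoRA delta in bf16
weight = weight.to(w_dtype)          # quantize back to fp8 storage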
@@ -60,8 +69,8 @@ def __init__(
         # LoRA
         self._lora_dict = OrderedDict()
         # Frozen LoRA
-        self._frozen_lora_list = []
-        self.register_buffer("_original_weight", None)
+        self.patched_frozen_lora = False
+        self._original_weight = None

     @staticmethod
     def from_linear(linear: nn.Linear):
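Turning _original_weight from a registered buffer into a plain attribute keeps the backup out of state_dict() and out of module-wide .to() device moves; my reading (not stated in the commit) is that this matters once the backup is deliberately parked in pinned CPU memory. A small illustration of the difference:

import torch
import torch.nn as nn

m = nn.Linear(2, 2)
m.register_buffer("_backup_buf", torch.zeros(2, 2))
m._backup_attr = torch.zeros(2, 2)  # plain attribute

print("_backup_buf" in m.state_dict())   # True: buffers are serialized and follow .to()
print("_backup_attr" in m.state_dict())  # False: a plain attribute stays where you put it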
@@ -118,20 +127,27 @@ def add_frozen_lora(
         save_original_weight: bool = True,
     ):
         if save_original_weight and self._original_weight is None:
-            self._original_weight = self.weight.clone()
+            if self.weight.dtype == torch.float8_e4m3fn:
+                self._original_weight = self.weight.to(dtype=torch.bfloat16, device="cpu", copy=True).pin_memory()
+            else:
+                self._original_weight = self.weight.to(device="cpu", copy=True).pin_memory()
         lora = LoRA(scale, rank, alpha, up, down, device, dtype)
         lora.apply_to(self)
-        self._frozen_lora_list.append(lora)
+        self.patched_frozen_lora = True

-    def clear(self):
-        if self._original_weight is None and len(self._frozen_lora_list) > 0:
+    def clear(self, release_all_cpu_memory: bool = False):
+        if self.patched_frozen_lora and self._original_weight is None:
             raise RuntimeError(
                 "Current LoRALinear has patched by frozen LoRA, but original weight is not saved, so you cannot clear LoRA."
             )
         self._lora_dict.clear()
-        self._frozen_lora_list = []
         if self._original_weight is not None:
-            self.weight.data.copy_(self._original_weight)
+            self.weight.data.copy_(
+                self._original_weight.to(device=self.weight.data.device, dtype=self.weight.data.dtype)
+            )
+        if release_all_cpu_memory:
+            del self._original_weight
+        self.patched_frozen_lora = False

     def forward(self, x):
         w_x = super().forward(x)
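The backup is now taken to pinned (page-locked) CPU memory, widened to bf16 when the live weight is fp8 (every e4m3 value is exactly representable in bf16, so this is lossless), and restore re-casts it to whatever device and dtype the live weight uses. A rough standalone sketch of the backup/restore idea (assumes a CUDA device, since pin_memory() needs one):

import torch

weight = torch.randn(1024, 1024, device="cuda", dtype=torch.bfloat16).to(torch.float8_e4m3fn)

# backup: keep the CPU copy in bf16, pinned so host-to-device restores are fast
backup = weight.to(dtype=torch.bfloat16, device="cpu", copy=True).pin_memory()

# ... merge one or more frozen LoRAs into `weight` in place ...

# restore: copy back and re-quantize to the live storage dtype
weight.copy_(backup.to(device=weight.device, dtype=weight.dtype))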
@@ -161,8 +177,8 @@ def __init__(
         # LoRA
         self._lora_dict = OrderedDict()
         # Frozen LoRA
-        self._frozen_lora_list = []
         self._original_weight = None
+        self.patched_frozen_lora = False

     @staticmethod
     def from_conv2d(conv2d: nn.Conv2d):
@@ -257,21 +273,25 @@ def add_frozen_lora(
         save_original_weight: bool = True,
     ):
         if save_original_weight and self._original_weight is None:
-            self._original_weight = self.weight.clone()
+            if self.weight.dtype == torch.float8_e4m3fn:
+                self._original_weight = self.weight.to(dtype=torch.bfloat16, device="cpu", copy=True).pin_memory()
+            else:
+                self._original_weight = self.weight.to(device="cpu", copy=True).pin_memory()
         lora = self._construct_lora(name, scale, rank, alpha, up, down, device, dtype)
         lora.apply_to(self)
-        self._frozen_lora_list.append(lora)
+        self.patched_frozen_lora = True

-    def clear(self):
-        if self._original_weight is None and len(self._frozen_lora_list) > 0:
+    def clear(self, release_all_cpu_memory: bool = False):
+        if self.patched_frozen_lora and self._original_weight is None:
             raise RuntimeError(
                 "Current LoRALinear has patched by frozen LoRA, but original weight is not saved, so you cannot clear LoRA."
             )
         self._lora_dict.clear()
-        self._frozen_lora_list = []
         if self._original_weight is not None:
-            self.weight.copy_(self._original_weight)
-            self._original_weight = None
+            self.weight.copy_(self._original_weight.to(device=self.weight.device, dtype=self.weight.dtype))
+        if release_all_cpu_memory:
+            del self._original_weight
+        self.patched_frozen_lora = False

     def forward(self, x):
         w_x = super().forward(x)
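The Conv2d variant follows the same pattern. A hypothetical end-to-end lifecycle, with the argument names guessed from the LoRA constructor in this diff (up/down as pre-built low-rank factors):

import torch
import torch.nn as nn

layer = LoRALinear.from_linear(nn.Linear(64, 64, dtype=torch.bfloat16))
up = nn.Linear(4, 64, bias=False, dtype=torch.bfloat16)    # rank-4 factors
down = nn.Linear(64, 4, bias=False, dtype=torch.bfloat16)

# merge a frozen LoRA; the original weight is first saved to pinned CPU memory
layer.add_frozen_lora(scale=1.0, rank=4, alpha=4.0, up=up, down=down,
                      device="cpu", dtype=torch.bfloat16)

# restore the original weight and free the pinned CPU backup
layer.clear(release_all_cpu_memory=True)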

diffsynth_engine/models/basic/transformer_helper.py
Lines changed: 0 additions & 11 deletions
@@ -1,6 +1,5 @@
 import torch
 import torch.nn as nn
-import math


 def modulate(x: torch.Tensor, shift: torch.Tensor, scale: torch.Tensor):
@@ -83,13 +82,3 @@ def forward(self, x):
         if self.elementwise_affine:
             return norm_result * self.weight
         return norm_result
-
-
-class NewGELUActivation(nn.Module):
-    """
-    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see
-    the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
-    """
-
-    def forward(self, input: "torch.Tensor") -> "torch.Tensor":
-        return 0.5 * input * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0))))
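NewGELUActivation can be deleted because torch.nn.GELU(approximate="tanh") computes exactly this tanh approximation, 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))). A quick equivalence check one might run:

import math
import torch
import torch.nn as nn

x = torch.randn(1024)
manual = 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
builtin = nn.GELU(approximate="tanh")(x)
print(torch.allclose(manual, builtin, atol=1e-6))  # expected: True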

diffsynth_engine/models/flux/flux_dit.py
Lines changed: 1 addition & 1 deletion
@@ -435,7 +435,7 @@ def forward(
         # addition of floating point numbers does not meet commutative law
         conditioning = self.time_embedder(timestep, hidden_states.dtype)
         if self.guidance_embedder is not None:
-            guidance = guidance * 1000
+            guidance = (guidance.to(torch.float32) * 1000).to(hidden_states.dtype)
             conditioning += self.guidance_embedder(guidance, hidden_states.dtype)
         conditioning += self.pooled_text_embedder(pooled_prompt_emb)
         rope_emb = self.pos_embedder(torch.cat((text_ids, image_ids), dim=1))
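Scaling guidance in float32 makes the multiply by 1000 exact and defers rounding to a single final cast; multiplying directly in the running dtype rounds twice (once when guidance is stored, once after the multiply) and can land on a different value. A small illustration, assuming bf16 as the running dtype:

import torch

g = torch.tensor([7.7])  # guidance scale held in float32

once = (g * 1000).to(torch.bfloat16)                       # scale in fp32, round once
twice = (g.to(torch.bfloat16) * 1000).to(torch.bfloat16)   # round, then scale in bf16

print(once.item(), twice.item())  # can differ by a bf16 step, e.g. 7712.0 vs 7680.0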

diffsynth_engine/models/text_encoder/t5.py
Lines changed: 2 additions & 4 deletions
@@ -4,7 +4,7 @@

 from diffsynth_engine.models.base import PreTrainedModel, StateDictConverter
 from diffsynth_engine.models.basic.relative_position_emb import RelativePositionEmbedding
-from diffsynth_engine.models.basic.transformer_helper import RMSNorm, NewGELUActivation
+from diffsynth_engine.models.basic.transformer_helper import RMSNorm
 from diffsynth_engine.models.basic.attention import Attention
 from diffsynth_engine.models.utils import no_init_weights
 from diffsynth_engine.utils.gguf import gguf_inference
@@ -21,14 +21,12 @@ def __init__(self, d_model, d_ff, dropout_rate, device: str = "cuda:0", dtype: t
         self.wi_1 = nn.Linear(d_model, d_ff, bias=False, device=device, dtype=dtype)
         self.wo = nn.Linear(d_ff, d_model, bias=False, device=device, dtype=dtype)
         self.dropout = nn.Dropout(dropout_rate)
-        self.act = NewGELUActivation()
+        self.act = nn.GELU(approximate="tanh")

     def forward(self, hidden_states):
         hidden_gelu = self.act(self.wi_0(hidden_states))
         hidden_linear = self.wi_1(hidden_states)
         hidden_states = self.dropout(hidden_gelu * hidden_linear)
-
-        hidden_states = hidden_states.to(self.wo.weight.dtype)
         hidden_states = self.wo(hidden_states)
         return hidden_states

diffsynth_engine/pipelines/base.py
Lines changed: 15 additions & 2 deletions
@@ -5,6 +5,7 @@
 from PIL import Image
 from dataclasses import dataclass
 from diffsynth_engine.utils.offload import enable_sequential_cpu_offload
+from diffsynth_engine.utils.fp8_linear import enable_fp8_autocast
 from diffsynth_engine.utils.gguf import load_gguf_checkpoint
 from diffsynth_engine.utils import logging
 from diffsynth_engine.utils.loader import load_file
@@ -100,7 +101,10 @@ def load_model_checkpoint(
         if not os.path.isfile(path):
             raise FileNotFoundError(f"{path} is not a file")
         elif path.endswith(".safetensors"):
-            state_dict.update(**load_file(path, device=device))
+            state_dict_ = load_file(path, device=device)
+            for key, value in state_dict_.items():
+                state_dict[key] = value.to(dtype)
+
         elif path.endswith(".gguf"):
             state_dict.update(**load_gguf_checkpoint(path, device=device, dtype=dtype))
         else:
@@ -154,7 +158,7 @@ def vae_output_to_image(vae_output: torch.Tensor) -> Image.Image:
     @staticmethod
     def generate_noise(shape, seed=None, device="cpu", dtype=torch.float16):
         generator = None if seed is None else torch.Generator(device).manual_seed(seed)
-        noise = torch.randn(shape, generator=generator, device=device, dtype=dtype)
+        noise = torch.randn(shape, generator=generator, device=device).to(dtype)
         return noise

     def encode_image(
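Drawing the noise in the generator's default float32 and casting afterwards means a given seed yields the same underlying sample for every requested dtype; torch.randn with dtype=... samples directly in that dtype, so fp16 and bf16 runs could otherwise diverge from the very first step. A quick check of the idea:

import torch

def generate_noise(shape, seed, dtype):
    generator = torch.Generator("cpu").manual_seed(seed)
    return torch.randn(shape, generator=generator).to(dtype)

a = generate_noise((4,), seed=0, dtype=torch.float16)
b = generate_noise((4,), seed=0, dtype=torch.bfloat16)
print(torch.allclose(a.float(), b.float(), atol=3e-2))  # True: same fp32 draw underneath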
@@ -294,6 +298,15 @@ def _enable_sequential_cpu_offload(self):
         enable_sequential_cpu_offload(model, self.device)
         self.offload_mode = "sequential_cpu_offload"

+    def enable_fp8_autocast(
+        self, model_names: List[str], compute_dtype: torch.dtype = torch.bfloat16, use_fp8_linear: bool = False
+    ):
+        for model_name in model_names:
+            model = getattr(self, model_name)
+            if model is not None:
+                enable_fp8_autocast(model, compute_dtype, use_fp8_linear)
+        self.fp8_autocast_enabled = True
+
     def load_models_to_device(self, load_model_names: List[str] | None = None):
         load_model_names = load_model_names if load_model_names else []
         # only load models to device if offload_mode is set
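The new pipeline method is a thin loop over the module-level enable_fp8_autocast from diffsynth_engine.utils.fp8_linear, applied to each named sub-model attribute. It gets called the way the FLUX pipeline does in the flux_image.py hunk below (pipe being an already constructed pipeline instance):

# enable store-fp8 / compute-bf16 on the DiT only
pipe.enable_fp8_autocast(model_names=["dit"], compute_dtype=torch.bfloat16, use_fp8_linear=False)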

diffsynth_engine/pipelines/flux_image.py
Lines changed: 23 additions & 14 deletions
@@ -526,29 +526,29 @@ def from_pretrained(
         model_config = (
             model_path_or_config
             if isinstance(model_path_or_config, FluxModelConfig)
-            else FluxModelConfig(dit_path=model_path_or_config, dit_dtype=dtype, t5_dtype=dtype, clip_dtype=dtype)
+            else FluxModelConfig(dit_path=model_path_or_config, dit_dtype=dtype, t5_dtype=dtype)
         )
         if model_config.vae_path is None:
-            model_config.vae_path = fetch_model("muse/flux_vae", revision="20241015120836", path="ae.safetensors")
+            model_config.vae_path = fetch_model("muse/FLUX.1-dev-fp8", path="ae-bf16.safetensors")

         if model_config.clip_path is None and load_text_encoder:
-            model_config.clip_path = fetch_model(
-                "muse/flux_clip_l", revision="20241209", path="clip_l_bf16.safetensors"
-            )
+            model_config.clip_path = fetch_model("muse/FLUX.1-dev-fp8", path="clip-bf16.safetensors")
         if model_config.t5_path is None and load_text_encoder:
             model_config.t5_path = fetch_model(
-                "muse/google_t5_v1_1_xxl", revision="20241024105236", path="t5xxl_v1_1_bf16.safetensors"
+                "muse/FLUX.1-dev-fp8", path=["t5-fp8-00001-of-00002.safetensors", "t5-fp8-00002-of-00002.safetensors"]
             )

         logger.info(f"loading state dict from {model_config.dit_path} ...")
-        dit_state_dict = cls.load_model_checkpoint(model_config.dit_path, device="cpu", dtype=dtype)
+        dit_state_dict = cls.load_model_checkpoint(model_config.dit_path, device="cpu", dtype=model_config.dit_dtype)
         logger.info(f"loading state dict from {model_config.vae_path} ...")
-        vae_state_dict = cls.load_model_checkpoint(model_config.vae_path, device="cpu", dtype=dtype)
+        vae_state_dict = cls.load_model_checkpoint(model_config.vae_path, device="cpu", dtype=model_config.vae_dtype)
         if load_text_encoder:
             logger.info(f"loading state dict from {model_config.clip_path} ...")
-            clip_state_dict = cls.load_model_checkpoint(model_config.clip_path, device="cpu", dtype=dtype)
+            clip_state_dict = cls.load_model_checkpoint(
+                model_config.clip_path, device="cpu", dtype=model_config.clip_dtype
+            )
             logger.info(f"loading state dict from {model_config.t5_path} ...")
-            t5_state_dict = cls.load_model_checkpoint(model_config.t5_path, device="cpu", dtype=dtype)
+            t5_state_dict = cls.load_model_checkpoint(model_config.t5_path, device="cpu", dtype=model_config.t5_dtype)

         init_device = "cpu" if parallelism > 1 or offload_mode is not None else device
         if load_text_encoder:
@@ -602,10 +602,20 @@ def from_pretrained(
             vae_tile_stride=vae_tile_stride,
             control_type=control_type,
             device=device,
-            dtype=dtype,
+            dtype=model_config.dit_dtype,
         )
-        if offload_mode is not None:
-            pipe.enable_cpu_offload(offload_mode)
+        pipe.enable_cpu_offload(offload_mode)
+        if model_config.dit_dtype == torch.float8_e4m3fn:
+            pipe.dtype = torch.bfloat16  # running dtype
+            pipe.enable_fp8_autocast(
+                model_names=["dit"], compute_dtype=pipe.dtype, use_fp8_linear=model_config.use_fp8_linear
+            )
+
+        if model_config.t5_dtype == torch.float8_e4m3fn:
+            pipe.dtype = torch.bfloat16  # running dtype
+            pipe.enable_fp8_autocast(
+                model_names=["text_encoder_2"], compute_dtype=pipe.dtype, use_fp8_linear=model_config.use_fp8_linear
+            )

         if parallelism > 1:
             parallel_config = cls.init_parallel_config(parallelism, use_cfg_parallel, model_config)
@@ -803,7 +813,6 @@ def predict_noise(
             current_step=current_step,
             total_step=total_step,
         )
-
         self.load_models_to_device(["dit"])

         noise_pred = self.dit(
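With these defaults, the fp8-distributed FLUX weights are stored as float8_e4m3fn but executed in bfloat16, which is what the commit title means by "store fp8, exec bf16". A hedged usage sketch (the local checkpoint path is hypothetical, and the pipeline class name is assumed from this file):

import torch
from diffsynth_engine.pipelines.flux_image import FluxImagePipeline

pipe = FluxImagePipeline.from_pretrained("flux-dit-fp8.safetensors", dtype=torch.float8_e4m3fn)
print(pipe.dtype)  # torch.bfloat16: the running dtype once fp8 autocast is enabled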

diffsynth_engine/utils/fp8_linear.py
Lines changed: 39 additions & 0 deletions
@@ -4,6 +4,45 @@
 from contextlib import contextmanager


+def enable_fp8_autocast(module: nn.Module, compute_dtype: torch.dtype = torch.bfloat16, use_fp8_linear: bool = False):
+    if len(list(module.children())) == 0:
+        if len(list(module.parameters())) > 0:
+            add_fp8_autocast_hook(module, compute_dtype)
+        return
+    if len(list(module.parameters(recurse=False))) > 0:
+        add_fp8_autocast_hook(module, compute_dtype)
+    for submodule in module.children():
+        if isinstance(submodule, nn.Linear) and use_fp8_linear:
+            continue
+        enable_fp8_autocast(submodule, compute_dtype, use_fp8_linear)
+
+
+def add_fp8_autocast_hook(module: nn.Module, compute_dtype: torch.dtype = torch.bfloat16):
+    def _fp8_autocast_pre_hook(module: nn.Module, input_):
+        for name, param in module.named_parameters():
+            if param.dtype == torch.float8_e4m3fn:
+                param.data = param.data.to(compute_dtype)
+        new_inputs = []
+        for x in input_:
+            if isinstance(x, torch.Tensor) and x.dtype in [torch.float8_e4m3fn, torch.float16, torch.bfloat16]:
+                new_inputs.append(x.to(compute_dtype))
+            else:
+                new_inputs.append(x)
+        return tuple(new_inputs)
+
+    def _fp8_autocast_hook(module: nn.Module, input_, output_):
+        for name, param in module.named_parameters():
+            if param.dtype == compute_dtype:
+                param.data = param.data.to(torch.float8_e4m3fn)
+
+    if getattr(module, "_fp8_autocast_enabled", False):
+        return
+    module.register_forward_pre_hook(_fp8_autocast_pre_hook)
+    module.register_forward_hook(_fp8_autocast_hook)
+    setattr(module, "_fp8_autocast_enabled", True)
+
+
 def enable_fp8_linear(module: nn.Module):
     _enable_fp8_linear(module)
     setattr(module, "fp8_linear_enabled", True)
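enable_fp8_autocast walks the module tree and attaches the hook pair to leaf modules (and any module holding direct parameters): the pre-hook upcasts fp8 weights and low-precision inputs to compute_dtype just before that module runs, and the post-hook re-quantizes to fp8 right after, so typically only one layer at a time lives in the wider dtype. With use_fp8_linear=True, nn.Linear children are skipped so a native fp8 linear path can handle them instead. A toy check of the mechanism, assuming a torch build with float8_e4m3fn support:

import torch
import torch.nn as nn
from diffsynth_engine.utils.fp8_linear import enable_fp8_autocast

lin = nn.Linear(8, 8, bias=False)
lin.weight.data = lin.weight.data.to(torch.float8_e4m3fn)  # fp8 storage
enable_fp8_autocast(lin, compute_dtype=torch.bfloat16)

x = torch.randn(2, 8, dtype=torch.bfloat16)
y = lin(x)               # pre-hook upcasts the weight to bf16 for the matmul
print(y.dtype)           # torch.bfloat16
print(lin.weight.dtype)  # torch.float8_e4m3fn again, re-quantized by the post-hook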

Binary test images updated, including tests/data/expect/flux/flux_lora.png (size changes: -33.7 KB, 936 KB, -142 KB).
