Commit d3d2909
supports flux parallel (#70)

* supports flux parallel
* fix sequence parallel issue by padding

1 parent f9a925f commit d3d2909

20 files changed: +316 additions, -145 deletions

README.md
Lines changed: 2 additions & 2 deletions

@@ -45,7 +45,7 @@ Text to image
 ```python
 from diffsynth_engine import fetch_model, FluxImagePipeline

-model_path = fetch_model("muse/flux-with-vae", path="flux_with_vae.safetensors")
+model_path = fetch_model("muse/flux-with-vae", path="flux1-dev-with-vae.safetensors")
 pipe = FluxImagePipeline.from_pretrained(model_path, device='cuda:0')
 image = pipe(prompt="a cat")
 image.save("image.png")
@@ -54,7 +54,7 @@ Text to image with LoRA
 ```python
 from diffsynth_engine import fetch_model, FluxImagePipeline

-model_path = fetch_model("muse/flux-with-vae", path="flux_with_vae.safetensors")
+model_path = fetch_model("muse/flux-with-vae", path="flux1-dev-with-vae.safetensors")
 lora_path = fetch_model("DonRat/MAJICFLUS_SuperChinesestyleheongsam", path="麦橘超国风旗袍.safetensors")

 pipe = FluxImagePipeline.from_pretrained(model_path, device='cuda:0')

diffsynth_engine/models/basic/attention.py
Lines changed: 0 additions & 2 deletions

@@ -201,10 +201,8 @@ def long_context_attention(
     assert attn_impl in [
         None,
         "auto",
-        "eager",
         "flash_attn_2",
         "flash_attn_3",
-        "xformers",
         "sdpa",
         "sage_attn",
         "sparge_attn",

diffsynth_engine/models/flux/flux_dit.py
Lines changed: 65 additions & 25 deletions

@@ -13,11 +13,12 @@
 )
 from diffsynth_engine.models.basic.timestep import TimestepEmbeddings
 from diffsynth_engine.models.base import PreTrainedModel, StateDictConverter
+from diffsynth_engine.models.basic import attention as attention_ops
 from diffsynth_engine.models.utils import no_init_weights
 from diffsynth_engine.utils.gguf import gguf_inference
 from diffsynth_engine.utils.fp8_linear import fp8_inference
 from diffsynth_engine.utils.constants import FLUX_DIT_CONFIG_FILE
-from diffsynth_engine.models.basic.attention import attention
+from diffsynth_engine.utils.parallel import sequence_parallel, sequence_parallel_unshard
 from diffsynth_engine.utils import logging


@@ -198,7 +199,7 @@ def forward(self, image, text, rope_emb, image_emb):
         k = torch.cat([self.norm_k_b(k_b), self.norm_k_a(k_a)], dim=1)
         v = torch.cat([v_b, v_a], dim=1)
         q, k = apply_rope(q, k, rope_emb)
-        attn_out = attention(q, k, v, attn_impl=self.attn_impl)
+        attn_out = attention_ops.attention(q, k, v, attn_impl=self.attn_impl)
         attn_out = rearrange(attn_out, "b s h d -> b s (h d)").to(q.dtype)
         text_out, image_out = attn_out[:, : text.shape[1]], attn_out[:, text.shape[1] :]
         image_out, text_out = self.attention_callback(
@@ -286,7 +287,7 @@ def attention_callback(self, attn_out, x, q, k, v, rope_emb, image_emb):
     def forward(self, x, rope_emb, image_emb):
         q, k, v = rearrange(self.to_qkv(x), "b s (h d) -> b s h d", h=(3 * self.num_heads)).chunk(3, dim=2)
         q, k = apply_rope(self.norm_q_a(q), self.norm_k_a(k), rope_emb)
-        attn_out = attention(q, k, v, attn_impl=self.attn_impl)
+        attn_out = attention_ops.attention(q, k, v, attn_impl=self.attn_impl)
         attn_out = rearrange(attn_out, "b s h d -> b s (h d)").to(q.dtype)
         return self.attention_callback(attn_out=attn_out, x=x, q=q, k=k, v=v, rope_emb=rope_emb, image_emb=image_emb)

@@ -324,6 +325,7 @@ def __init__(
         self,
         in_channel: int = 64,
         attn_impl: Optional[str] = None,
+        use_usp: bool = False,
         device: str = "cuda:0",
         dtype: torch.dtype = torch.bfloat16,
     ):
@@ -349,6 +351,8 @@ def __init__(
         self.final_norm_out = AdaLayerNorm(3072, device=device, dtype=dtype)
         self.final_proj_out = nn.Linear(3072, 64, device=device, dtype=dtype)

+        self.use_usp = use_usp
+
     def patchify(self, hidden_states):
         hidden_states = rearrange(hidden_states, "B C (H P) (W Q) -> B (H W) (C P Q)", P=2, Q=2)
         return hidden_states
@@ -359,7 +363,8 @@ def unpatchify(self, hidden_states, height, width):
         )
         return hidden_states

-    def prepare_image_ids(self, latents):
+    @staticmethod
+    def prepare_image_ids(latents: torch.Tensor):
         batch_size, _, height, width = latents.shape
         latent_image_ids = torch.zeros(height // 2, width // 2, 3)
         latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height // 2)[:, None]
@@ -389,7 +394,14 @@ def forward(
         controlnet_single_block_output=None,
         **kwargs,
     ):
-        height, width = hidden_states.shape[-2:]
+        h, w = hidden_states.shape[-2:]
+        controlnet_double_block_output = (
+            controlnet_double_block_output if controlnet_double_block_output is not None else ()
+        )
+        controlnet_single_block_output = (
+            controlnet_single_block_output if controlnet_single_block_output is not None else ()
+        )
+
         fp8_linear_enabled = getattr(self, "fp8_linear_enabled", False)
         with fp8_inference(fp8_linear_enabled), gguf_inference():
             if image_ids is None:
@@ -402,28 +414,54 @@ def forward(
                 guidance = guidance * 1000
                 conditioning += self.guidance_embedder(guidance, hidden_states.dtype)
             conditioning += self.pooled_text_embedder(pooled_prompt_emb)
-            prompt_emb = self.context_embedder(prompt_emb)
             rope_emb = self.pos_embedder(torch.cat((text_ids, image_ids), dim=1))
+            text_rope_emb = rope_emb[:, :, : text_ids.size(1)]
+            image_rope_emb = rope_emb[:, :, text_ids.size(1) :]
             hidden_states = self.patchify(hidden_states)
-            hidden_states = self.x_embedder(hidden_states)
-            for i, block in enumerate(self.blocks):
-                hidden_states, prompt_emb = block(hidden_states, prompt_emb, conditioning, rope_emb, image_emb)
-                if controlnet_double_block_output is not None:
-                    interval_control = len(self.blocks) / len(controlnet_double_block_output)
-                    interval_control = int(np.ceil(interval_control))
-                    hidden_states = hidden_states + controlnet_double_block_output[i // interval_control]
-            hidden_states = torch.cat([prompt_emb, hidden_states], dim=1)
-            for i, block in enumerate(self.single_blocks):
-                hidden_states = block(hidden_states, conditioning, rope_emb, image_emb)
-                if controlnet_single_block_output is not None:
-                    interval_control = len(self.single_blocks) / len(controlnet_double_block_output)
-                    interval_control = int(np.ceil(interval_control))
-                    hidden_states = hidden_states + controlnet_single_block_output[i // interval_control]
-
-            hidden_states = hidden_states[:, prompt_emb.shape[1] :]
-            hidden_states = self.final_norm_out(hidden_states, conditioning)
-            hidden_states = self.final_proj_out(hidden_states)
-            hidden_states = self.unpatchify(hidden_states, height, width)
+
+            with sequence_parallel(
+                (
+                    hidden_states,
+                    prompt_emb,
+                    text_rope_emb,
+                    image_rope_emb,
+                    *controlnet_double_block_output,
+                    *controlnet_single_block_output,
+                ),
+                seq_dims=(
+                    1,
+                    1,
+                    2,
+                    2,
+                    *(1 for _ in controlnet_double_block_output),
+                    *(1 for _ in controlnet_single_block_output),
+                ),
+                enabled=self.use_usp,
+            ):
+                hidden_states = self.x_embedder(hidden_states)
+                prompt_emb = self.context_embedder(prompt_emb)
+                rope_emb = torch.cat((text_rope_emb, image_rope_emb), dim=2)
+
+                for i, block in enumerate(self.blocks):
+                    hidden_states, prompt_emb = block(hidden_states, prompt_emb, conditioning, rope_emb, image_emb)
+                    if len(controlnet_double_block_output) > 0:
+                        interval_control = len(self.blocks) / len(controlnet_double_block_output)
+                        interval_control = int(np.ceil(interval_control))
+                        hidden_states = hidden_states + controlnet_double_block_output[i // interval_control]
+                hidden_states = torch.cat([prompt_emb, hidden_states], dim=1)
+                for i, block in enumerate(self.single_blocks):
+                    hidden_states = block(hidden_states, conditioning, rope_emb, image_emb)
+                    if len(controlnet_single_block_output) > 0:
+                        interval_control = len(self.single_blocks) / len(controlnet_double_block_output)
+                        interval_control = int(np.ceil(interval_control))
+                        hidden_states = hidden_states + controlnet_single_block_output[i // interval_control]

+                hidden_states = hidden_states[:, prompt_emb.shape[1] :]
+                hidden_states = self.final_norm_out(hidden_states, conditioning)
+                hidden_states = self.final_proj_out(hidden_states)
+                (hidden_states,) = sequence_parallel_unshard((hidden_states,), seq_dims=(1,), seq_lens=(h * w // 4,))
+
+            hidden_states = self.unpatchify(hidden_states, h, w)
         return hidden_states

     @classmethod
@@ -434,6 +472,7 @@ def from_state_dict(
         dtype: torch.dtype,
         in_channel: int = 64,
         attn_impl: Optional[str] = None,
+        use_usp: bool = False,
     ):
         with no_init_weights():
             model = torch.nn.utils.skip_init(
@@ -442,6 +481,7 @@ def from_state_dict(
                 dtype=dtype,
                 in_channel=in_channel,
                 attn_impl=attn_impl,
+                use_usp=use_usp,
             )
             model = model.requires_grad_(False)  # for loading gguf
             model.load_state_dict(state_dict, assign=True)
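The new `sequence_parallel` / `sequence_parallel_unshard` helpers live in `diffsynth_engine/utils/parallel.py` and their implementation is not shown in this diff. Per the commit message ("fix sequence parallel issue by padding"), the sequence dimension is padded so every rank receives an equal shard, and the unshard call trims the gathered result back to the true length (`h * w // 4` here, i.e. the number of 2x2 latent patches produced by `patchify`). The sketch below only illustrates that pad/shard/trim arithmetic locally; `pad_and_shard` and `unshard` are hypothetical names, not diffsynth_engine APIs, and no process group or collective is involved.

```python
import torch

def pad_and_shard(t: torch.Tensor, seq_dim: int, world_size: int, rank: int) -> torch.Tensor:
    """Pad the sequence dim to a multiple of world_size, then take this rank's slice."""
    seq_len = t.size(seq_dim)
    pad = (-seq_len) % world_size  # padding needed so every rank gets an equal shard
    if pad > 0:
        pad_shape = list(t.shape)
        pad_shape[seq_dim] = pad
        t = torch.cat([t, t.new_zeros(pad_shape)], dim=seq_dim)
    return t.chunk(world_size, dim=seq_dim)[rank]

def unshard(shards: list, seq_dim: int, seq_len: int) -> torch.Tensor:
    """Concatenate per-rank shards and drop the padding to recover the original length."""
    full = torch.cat(shards, dim=seq_dim)
    return full.narrow(seq_dim, 0, seq_len)

# Local demonstration with world_size=4 and a sequence length that is not divisible by 4.
x = torch.randn(1, 4096 + 3, 64)  # (batch, seq, dim)
shards = [pad_and_shard(x, seq_dim=1, world_size=4, rank=r) for r in range(4)]
restored = unshard(shards, seq_dim=1, seq_len=x.size(1))
assert torch.equal(restored, x)
```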

diffsynth_engine/models/wan/wan_dit.py
Lines changed: 17 additions & 42 deletions

@@ -2,12 +2,11 @@
 import json
 import torch
 import torch.nn as nn
-import torch.distributed as dist
 from typing import Tuple, Optional
 from einops import rearrange

 from diffsynth_engine.models.base import StateDictConverter, PreTrainedModel
-from diffsynth_engine.models.basic.attention import attention, long_context_attention
+from diffsynth_engine.models.basic import attention as attention_ops
 from diffsynth_engine.models.basic.transformer_helper import RMSNorm
 from diffsynth_engine.models.utils import no_init_weights
 from diffsynth_engine.utils.constants import (
@@ -17,11 +16,7 @@
     WAN_DIT_14B_FLF2V_CONFIG_FILE,
 )
 from diffsynth_engine.utils.gguf import gguf_inference
-from diffsynth_engine.utils.parallel import (
-    get_sp_group,
-    get_sp_world_size,
-    get_sp_rank,
-)
+from diffsynth_engine.utils.parallel import sequence_parallel, sequence_parallel_unshard

 T5_TOKEN_NUM = 512
 FLF_TOKEN_NUM = 257 * 2
@@ -90,20 +85,12 @@ def forward(self, x, freqs):
         q = rearrange(q, "b s (n d) -> b s n d", n=num_heads)
         k = rearrange(k, "b s (n d) -> b s n d", n=num_heads)
         v = rearrange(v, "b s (n d) -> b s n d", n=num_heads)
-        if getattr(self, "use_usp", False):
-            x = long_context_attention(
-                q=rope_apply(q, freqs),
-                k=rope_apply(k, freqs),
-                v=v,
-                attn_impl=self.attn_impl,
-            )
-        else:
-            x = attention(
-                q=rope_apply(q, freqs),
-                k=rope_apply(k, freqs),
-                v=v,
-                attn_impl=self.attn_impl,
-            )
+        x = attention_ops.attention(
+            q=rope_apply(q, freqs),
+            k=rope_apply(k, freqs),
+            v=v,
+            attn_impl=self.attn_impl,
+        )
         x = x.flatten(2)
         return self.o(x)

@@ -148,12 +135,12 @@ def forward(self, x: torch.Tensor, y: torch.Tensor):
         k = rearrange(k, "b s (n d) -> b s n d", n=num_heads)
         v = rearrange(v, "b s (n d) -> b s n d", n=num_heads)

-        x = attention(q, k, v, attn_impl=self.attn_impl).flatten(2)
+        x = attention_ops.attention(q, k, v, attn_impl=self.attn_impl).flatten(2)
         if self.has_image_input:
             k_img, v_img = self.norm_k_img(self.k_img(img)), self.v_img(img)
             k_img = rearrange(k_img, "b s (n d) -> b s n d", n=num_heads)
             v_img = rearrange(v_img, "b s (n d) -> b s n d", n=num_heads)
-            y = attention(q, k_img, v_img, attn_impl=self.attn_impl).flatten(2)
+            y = attention_ops.attention(q, k_img, v_img, attn_impl=self.attn_impl).flatten(2)
             x = x + y
         return self.o(x)

@@ -316,10 +303,7 @@ def __init__(
         if has_image_input:
             self.img_emb = MLP(1280, dim, flf_pos_emb, device=device, dtype=dtype)  # clip_feature_dim = 1280

-        if use_usp:
-            setattr(self, "use_usp", True)
-            for block in self.blocks:
-                setattr(block.self_attn, "use_usp", True)
+        self.use_usp = use_usp

     def patchify(self, x: torch.Tensor):
         x = self.patch_embedding(x)  # b c f h w -> b 4c f h/2 w/2
@@ -368,21 +352,12 @@ def forward(
             .reshape(f * h * w, 1, -1)
             .to(x.device)
         )
-        if getattr(self, "use_usp", False):
-            s, p = x.size(1), get_sp_world_size()  # (sequence_length, parallelism)
-            split_size = [s // p + 1 if i < s % p else s // p for i in range(p)]
-            x = torch.split(x, split_size, dim=1)[get_sp_rank()]
-            freqs = torch.split(freqs, split_size, dim=0)[get_sp_rank()]
-
-        for block in self.blocks:
-            x = block(x, context, t_mod, freqs)
-        x = self.head(x, t)
-
-        if getattr(self, "use_usp", False):
-            b, d = x.size(0), x.size(2)  # (batch_size, out_dim)
-            xs = [torch.zeros((b, s, d), dtype=x.dtype, device=x.device) for s in split_size]
-            dist.all_gather(xs, x, group=get_sp_group())
-            x = torch.concat(xs, dim=1)
+
+        with sequence_parallel([x, freqs], seq_dims=(1, 0), enabled=self.use_usp):
+            for block in self.blocks:
+                x = block(x, context, t_mod, freqs)
+            x = self.head(x, t)
+            (x,) = sequence_parallel_unshard((x,), seq_dims=(1,), seq_lens=(f * h * w,))
         x = self.unpatchify(x, (f, h, w))
         return x

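Both DiT models also switch their call sites from `from ...attention import attention` to the module-qualified `attention_ops.attention(...)`, and the `use_usp` branch that previously selected `long_context_attention` is gone. The diff does not state the motivation; one plausible reading is that resolving the attribute at call time lets the sequence-parallel context swap the attention implementation on the module without every caller branching on `use_usp`. The snippet below is a generic, self-contained illustration of that late-binding pattern under this assumption; it is not diffsynth_engine code.

```python
import types

# Throwaway module object standing in for diffsynth_engine.models.basic.attention.
attention_ops = types.ModuleType("attention_ops")

def _plain_attention(q, k, v):
    return "plain"

attention_ops.attention = _plain_attention

# Early binding: the function object is resolved once, at "import" time.
attention = attention_ops.attention

def call_late():
    # Late binding: the module attribute is looked up on every call.
    return attention_ops.attention(None, None, None)

def call_early():
    return attention(None, None, None)

# A parallel context could patch the module attribute with a sequence-parallel variant.
attention_ops.attention = lambda q, k, v: "sequence-parallel"

print(call_late())   # "sequence-parallel" -> sees the patched implementation
print(call_early())  # "plain" -> still bound to the original function
```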

diffsynth_engine/pipelines/base.py
Lines changed: 58 additions & 6 deletions

@@ -26,14 +26,21 @@ def convert(self, lora_state_dict: Dict[str, torch.Tensor]) -> Dict[str, Dict[st
 class BasePipeline:
     lora_converter = LoRAStateDictConverter()

-    def __init__(self, vae_tiled, vae_tile_size, vae_tile_stride, device="cuda:0", dtype=torch.float16):
+    def __init__(
+        self,
+        vae_tiled: bool = False,
+        vae_tile_size: int = -1,
+        vae_tile_stride: int = -1,
+        device="cuda:0",
+        dtype=torch.float16,
+    ):
         super().__init__()
-        self.device = device
-        self.dtype = dtype
-        self.offload_mode = None
         self.vae_tiled = vae_tiled
         self.vae_tile_size = vae_tile_size
         self.vae_tile_stride = vae_tile_stride
+        self.device = device
+        self.dtype = dtype
+        self.offload_mode = None
         self.model_names = []

     @classmethod
@@ -199,8 +206,53 @@ def eval(self):
             model.eval()
         return self

-    def enable_fp8_linear(self):
-        raise NotImplementedError()
+    @staticmethod
+    def init_parallel_config(
+        parallelism: int,
+        use_cfg_parallel: bool,
+        model_config: ModelConfig,
+    ):
+        assert parallelism in (2, 4, 8), "parallelism must be 2, 4 or 8"
+        cfg_degree = 2 if use_cfg_parallel else 1
+        sp_ulysses_degree = getattr(model_config, "sp_ulysses_degree", None)
+        sp_ring_degree = getattr(model_config, "sp_ring_degree", None)
+        tp_degree = getattr(model_config, "tp_degree", None)
+        use_fsdp = getattr(model_config, "use_fsdp", False)
+
+        if tp_degree is not None:
+            assert sp_ulysses_degree is None and sp_ring_degree is None, (
+                "not allowed to enable sequence parallel and tensor parallel together; "
+                "either set sp_ulysses_degree=None, sp_ring_degree=None or set tp_degree=None during pipeline initialization"
+            )
+            assert use_fsdp is False, (
+                "not allowed to enable fully sharded data parallel and tensor parallel together; "
+                "either set use_fsdp=False or set tp_degree=None during pipeline initialization"
+            )
+            assert parallelism == cfg_degree * tp_degree, (
+                f"parallelism ({parallelism}) must be equal to cfg_degree ({cfg_degree}) * tp_degree ({tp_degree})"
+            )
+            sp_ulysses_degree = 1
+            sp_ring_degree = 1
+        elif sp_ulysses_degree is None and sp_ring_degree is None:
+            # use ulysses if not specified
+            sp_ulysses_degree = parallelism // cfg_degree
+            sp_ring_degree = 1
+            tp_degree = 1
+        elif sp_ulysses_degree is not None and sp_ring_degree is not None:
+            assert parallelism == cfg_degree * sp_ulysses_degree * sp_ring_degree, (
+                f"parallelism ({parallelism}) must be equal to cfg_degree ({cfg_degree}) * "
+                f"sp_ulysses_degree ({sp_ulysses_degree}) * sp_ring_degree ({sp_ring_degree})"
+            )
+            tp_degree = 1
+        else:
+            raise ValueError("sp_ulysses_degree and sp_ring_degree must be specified together")
+        return {
+            "cfg_degree": cfg_degree,
+            "sp_ulysses_degree": sp_ulysses_degree,
+            "sp_ring_degree": sp_ring_degree,
+            "tp_degree": tp_degree,
+            "use_fsdp": use_fsdp,
+        }

     @staticmethod
     def validate_offload_mode(offload_mode: str | None):
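The new `init_parallel_config` resolves a total `parallelism` degree plus the flags on the model config into per-dimension degrees (CFG, Ulysses, ring, tensor parallel). A hedged usage sketch: the import path is inferred from this file's location and may differ in the packaged API, and `SimpleNamespace` merely stands in for the real `ModelConfig`, since the method only reads attributes via `getattr`.

```python
from types import SimpleNamespace

# Assumed import path, inferred from diffsynth_engine/pipelines/base.py.
from diffsynth_engine.pipelines.base import BasePipeline

# Stand-in for ModelConfig; only these attributes are read via getattr().
cfg = SimpleNamespace(sp_ulysses_degree=None, sp_ring_degree=None, tp_degree=None, use_fsdp=False)

# 8 GPUs with CFG parallel: cfg_degree=2, and the remaining factor of 4 defaults to Ulysses.
print(BasePipeline.init_parallel_config(parallelism=8, use_cfg_parallel=True, model_config=cfg))
# {'cfg_degree': 2, 'sp_ulysses_degree': 4, 'sp_ring_degree': 1, 'tp_degree': 1, 'use_fsdp': False}

# Explicit Ulysses x ring split: parallelism must equal cfg_degree * sp_ulysses_degree * sp_ring_degree.
cfg = SimpleNamespace(sp_ulysses_degree=2, sp_ring_degree=2, tp_degree=None, use_fsdp=False)
print(BasePipeline.init_parallel_config(parallelism=4, use_cfg_parallel=False, model_config=cfg))
# {'cfg_degree': 1, 'sp_ulysses_degree': 2, 'sp_ring_degree': 2, 'tp_degree': 1, 'use_fsdp': False}
```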
