From bf301c4094b4713c9680f7b0357f12bbb7ad09f0 Mon Sep 17 00:00:00 2001
From: Dong1017
Date: Mon, 27 Oct 2025 14:51:11 +0800
Subject: [PATCH 1/8] fix: diffusers merged PR 12219

---
 .../finetune_lora_with_mindspore_trainer.py   | 519 ++++++++++++++++++
 examples/diffusers/qwenimage/zero3.json       |   5 +
 .../transformers/transformer_qwenimage.py     |  79 +--
 3 files changed, 569 insertions(+), 34 deletions(-)
 create mode 100644 examples/diffusers/qwenimage/finetune_lora_with_mindspore_trainer.py
 create mode 100644 examples/diffusers/qwenimage/zero3.json

diff --git a/examples/diffusers/qwenimage/finetune_lora_with_mindspore_trainer.py b/examples/diffusers/qwenimage/finetune_lora_with_mindspore_trainer.py
new file mode 100644
index 0000000000..540a0152cb
--- /dev/null
+++ b/examples/diffusers/qwenimage/finetune_lora_with_mindspore_trainer.py
@@ -0,0 +1,519 @@
+"""
+Qwen-Image model fine-tuning script using LoRA.
+
+With its default values, this script fine-tunes the pretrained Qwen-Image transformer with LoRA
+on the `lambdalabs/pokemon-blip-captions` dataset for Pokemon image generation.
+
+Usage:
+```
+DEVICE_ID=0 python finetune_lora_with_mindspore_trainer.py \
+    --model_path Qwen/Qwen-Image \
+    --lora_rank 8 \
+    --lora_alpha 16 \
+    --dataset_path lambdalabs/pokemon-blip-captions \
+    --output_dir ./outputs/lora \
+    --num_train_epochs 1 \
+    --eval_strategy no \
+    --per_device_train_batch_size 1 \
+    --gradient_accumulation_steps 1 \
+    --learning_rate 1e-5 \
+    --save_strategy steps \
+    --save_steps 500 \
+    --logging_steps 1 \
+    --save_total_limit 1 \
+    --download_num_workers 4
+```
+
+With multiple cards (note that bf16 requires mindspore>=2.7.0):
+```
+export ASCEND_RT_VISIBLE_DEVICES=0,1
+NPUS=2
+MASTER_PORT=9000
+LOG_DIR=outputs/lora
+msrun --bind_core=True --worker_num=${NPUS} --local_worker_num=${NPUS} --master_port=${MASTER_PORT} --log_dir=${LOG_DIR}/parallel_logs \
+python finetune_lora_with_mindspore_trainer.py \
+    --output_dir ${LOG_DIR} \
+    --num_train_epochs 1 \
+    --learning_rate 1e-5 \
+    --save_strategy no \
+    --bf16
+```
+"""
+
+import inspect
+import io
+import logging
+import math
+import os
+from dataclasses import dataclass, field
+from typing import List, Optional, Union
+
+import evaluate
+import numpy as np
+from datasets import load_dataset
+from PIL import Image
+from transformers import HfArgumentParser
+
+import mindspore as ms
+import mindspore.mint.distributed as dist
+from mindspore import nn, ops
+
+from mindone.diffusers import QwenImagePipeline
+from mindone.diffusers.training_utils import cast_training_params
+from mindone.peft import LoraConfig, get_peft_model, get_peft_model_state_dict
+from mindone.trainers import create_optimizer
+from mindone.transformers.mindspore_adapter import MindSporeArguments, init_environment
+from mindone.transformers.optimization import get_scheduler
+from mindone.transformers.trainer import Trainer
+from mindone.transformers.training_args import TrainingArguments
+
+logger = logging.getLogger(__name__)
+
+IGNORE_INDEX = -100
+
+
+@dataclass
+class MyArguments(MindSporeArguments, TrainingArguments):
+    amp_opt_level: str = field(default="O0")
+    dataset_path: str = field(default="lambdalabs/pokemon-blip-captions")
+    deepspeed: str = field(default="zero3.json")
+    device_target: str = field(default="Ascend")
+    do_eval: bool = field(default=False)
+    enable_flash_attention: bool = field(default=True)
+    # gradient_checkpointing: bool = field(default=False)  # not supported with LoRA
+    is_distribute: bool = field(default=False)
+    lora_rank: int = field(default=8, metadata={"help": "The dimension of the LoRA update matrices."})
metadata={"help": "The dimension of the LoRA update matrices."}) + lora_alpha: int = field(default=16, metadata={"help": "The scaling factor alpha of the LoRA."}) + mode: int = field(default=ms.PYNATIVE_MODE, metadata={"help": "Graph(not supported)/Pynative"}) + model_path: str = field(default="Qwen/Qwen-Image") + output_dir: str = field(default="./outputs") + per_device_train_batch_size: int = field( + default=1, metadata={"help": "Batch size per device for training."} + ) # no use + resume: Union[bool, str] = field(default=False, metadata={"help": "Resume training from a checkpoint."}) + save_strategy: str = field(default="no", metadata={"help": "Save strategy, no, steps or epoch"}) + + +@dataclass +class DataArguments: + dataset_use: str = field(default="") + max_length: int = field(default=4096, metadata={"help": "Fixed token length for training."}) + height: int = field(default=512) + width: int = field(default=512) + num_inference_steps: int = field(default=8, metadata={"help": "Inference steps when denoising in training."}) + + +def freeze_params(m: nn.Cell): + for p in m.get_parameters(): + p.requires_grad = False + + +def main(): + parser = HfArgumentParser((MyArguments, DataArguments)) + args, data_args = parser.parse_args_into_dataclasses() + + init_environment(args) + + dist.init_process_group() + ms.set_auto_parallel_context(parallel_mode=ms.ParallelMode.DATA_PARALLEL) + local_rank = dist.get_rank() + world_size = dist.get_world_size() + args.rank_size = world_size + args.rank = local_rank + args.zero_stage = 3 + + # 1. Load materials + # 1.1 Load pretrained model from pipe + ms_dtype = ms.bfloat16 if args.bf16 else (ms.float16 if args.fp16 else ms.float32) + parent_model = QwenImagePipeline.from_pretrained( + args.model_path, + mindspore_dtype=ms_dtype, + ) + data_args.vae_config = parent_model.vae.config + data_args.ms_dtype = ms_dtype + + # 1.2 the dataset + dataset = load_dataset("parquet", data_dir=args.dataset_path, split="train") + dataset = dataset.shuffle(seed=42) + train_indices = list(range(666)) + eval_indices = list(range(666, 833)) + + def process_function(examples): + image = Image.open(io.BytesIO(examples["image"]["bytes"])).convert("RGB").resize((512, 512)) + txt = examples["text"] + + # prepare the inputs + encoder_hidden_states, encoder_hidden_states_mask = parent_model.encode_prompt(txt) + height = data_args.height + width = data_args.width + batch_size = encoder_hidden_states.shape[0] + hidden_states = parent_model.prepare_latents( + batch_size=batch_size, + num_channels_latents=parent_model.transformer.config.in_channels // 4, + height=height, + width=width, + dtype=encoder_hidden_states.dtype, + generator=np.random.Generator(np.random.PCG64(seed=42)), + latents=None, + ) + + # prepare the labels: convert the image to latent space + pixel_values = ms.Tensor(np.array(image, dtype=np.float32)) / 255.0 # (H, W, C) = (512, 512, 3) + pixel_values = pixel_values.transpose(2, 0, 1) # (H, W, C) -> (C, H, W) + + # write to dataset + examples["encoder_hidden_states"] = encoder_hidden_states[0].astype(np.float32) + examples["encoder_hidden_states_mask"] = encoder_hidden_states_mask[0].asnumpy() + examples["hidden_states"] = hidden_states[0].astype(np.float32) + examples["txt_seq_lens"] = encoder_hidden_states_mask.shape[-1] + examples["labels"] = pixel_values + + if not args.do_eval: + examples.pop("text") # remove text from examples + examples.pop("image") # remove image from examples + + return examples + + tokenized_datasets = dataset.map(process_function, 
+    train_dataset = tokenized_datasets.select(train_indices)
+    eval_dataset = tokenized_datasets.select(eval_indices)
+
+    dataset_len = len(train_dataset)
+    num_update_steps_per_epoch = max(1, dataset_len // args.gradient_accumulation_steps)
+    num_training_steps = math.ceil(args.num_train_epochs * num_update_steps_per_epoch)
+
+    # 2. Prepare for LoRA
+    # 2.1. Determine the target model
+    model = parent_model.transformer
+    model.config.use_cache = False
+    model.gradient_checkpointing = True
+    model.training = True
+    freeze_params(model)
+    freeze_params(parent_model.vae)
+    freeze_params(parent_model.text_encoder)
+
+    # 2.2. Prepare the LoRA config
+    # all attn linear layers
+    text_enc_modules = []
+    vae_enc_modules = []
+    transformer_attn_modules = []
+    for i in range(model.config.num_layers):
+        transformer_attn_modules.append(f"transformer_blocks.{i}.attn.to_q")
+        transformer_attn_modules.append(f"transformer_blocks.{i}.attn.to_k")
+        transformer_attn_modules.append(f"transformer_blocks.{i}.attn.to_v")
+        transformer_attn_modules.append(f"transformer_blocks.{i}.attn.add_q_proj")
+        transformer_attn_modules.append(f"transformer_blocks.{i}.attn.add_k_proj")
+        transformer_attn_modules.append(f"transformer_blocks.{i}.attn.add_v_proj")
+        transformer_attn_modules.append(f"transformer_blocks.{i}.attn.to_out.0")
+        transformer_attn_modules.append(f"transformer_blocks.{i}.attn.to_add_out")
+
+    target_modules = text_enc_modules + vae_enc_modules + transformer_attn_modules
+    lora_config = LoraConfig(
+        r=args.lora_rank,
+        lora_alpha=args.lora_alpha,
+        init_lora_weights="gaussian",
+        target_modules=target_modules,
+    )
+
+    model = get_peft_model(model, lora_config)
+    if args.fp16 or args.bf16:
+        cast_training_params(model, dtype=ms.float32)
+    model.print_trainable_parameters()
+
+    # 3. [optional] Prepare the evaluation metric
+    if args.do_eval:  # TODO: not supported yet
+        metric = evaluate.load("mse")
+
+        def compute_metrics(eval_pred):
+            preds, labels = eval_pred
+            return metric.compute(predictions=preds, references=labels)
+
+    else:
+        compute_metrics = None
+
+    # 4. Training setups: lr scheduler, optimizer, trainer, etc.
+    # lr scheduler
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler_type,
+        base_lr=args.learning_rate,
+        num_warmup_steps=args.warmup_steps,
+        num_training_steps=num_training_steps,
+        scheduler_specific_kwargs=args.lr_scheduler_kwargs,
+    )
+    # [required] optimizer
+    # FIXME: since only the LoRA layers are trained, letting the transformers Trainer auto-create the optimizer
+    # may yield an empty parameter list, because there are no trainable layernorm layers.
+    optimizer_kwargs = {
+        "name": "adamw",
+        "betas": (args.adam_beta1, args.adam_beta2),
+        "eps": args.adam_epsilon,
+        "lr": lr_scheduler,
+    }
+    optimizer = create_optimizer(model.get_base_model().trainable_params(), **optimizer_kwargs)
+
+    # trainer
+    trainer = Trainer(
+        # model=model.get_base_model(),  # use base model for parsing construct() arguments
+        model=TrainStepForQwenImage(
+            model.get_base_model(),
+            parent_model.vae.decode,
+            parent_model.scheduler,
+            parent_model.image_processor.postprocess,
+            data_args,
+        ),
+        args=args,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+        compute_metrics=compute_metrics,
+        optimizers=(optimizer, lr_scheduler),  # for LoRA
+    )  # compute_loss is not supported yet
+
+    # trainer.train(resume_from_checkpoint=args.resume)  # FIXME: resume training is not supported yet
+    # FIXME: use the code below temporarily for now
+    if isinstance(args.resume, str) or (isinstance(args.resume, bool) and args.resume):
+        from transformers.trainer_callback import TrainerState
+        from transformers.trainer_utils import get_last_checkpoint
+
+        TRAINER_STATE_NAME = "trainer_state.json"
+        resume_from_checkpoint = None
+        # load potential checkpoint
+        resume_path = args.resume if isinstance(args.resume, str) else args.output_dir
+        resume_from_checkpoint = get_last_checkpoint(resume_path)
+        if resume_from_checkpoint is None:
+            raise ValueError(f"No valid checkpoint found in {resume_path}.")
+        trainer._load_from_checkpoint(resume_from_checkpoint)
+        trainer.state = TrainerState.load_from_json(os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME))
+        trainer.args.num_train_epochs -= trainer.state.epoch
+
+    # train the model and save the LoRA weights
+    def save_lora_model(model, output_dir):
+        if args.zero_stage == 3:
+            all_gather_op = ops.AllGather()
+
+            transformer_lora_layers_to_save_new = {}
+            transformer_lora_layers_to_save = get_peft_model_state_dict(model)
+
+            for name, param in transformer_lora_layers_to_save.items():
+                if name.startswith("base_model.model."):
+                    name = name.replace("base_model.model.", "")
+                data = ms.Tensor(all_gather_op(param).asnumpy())
+                transformer_lora_layers_to_save_new[name] = data
+
+            if args.rank == 0:
+                QwenImagePipeline.save_lora_weights(
+                    output_dir,
+                    transformer_lora_layers=transformer_lora_layers_to_save_new,
+                    weight_name="adapter_model.safetensors",
+                )
+
+        else:
+            model.save_pretrained(output_dir)
+
+        print(f"LoRA model has been saved in {output_dir}.")
+
+    if trainer.args.num_train_epochs > 0:
+        trainer.train()
+    save_lora_model(model, os.path.join(args.output_dir, "lora"))
+
+    # 5. Inference and evaluation
+    if args.do_eval:  # FIXME: bf16 not supported yet
+        print("Fuse lora weights into pipe and do eval.")
+
+        # loading function
+        def load_lora_model(model, parent_model, input_dir):
+            if args.zero_stage == 3:
+                import gc
+
+                del model
+                del parent_model
+                gc.collect()
+                ms.hal.empty_cache()
+
+                parent_model = QwenImagePipeline.from_pretrained(
+                    args.model_path,
+                    mindspore_dtype=ms_dtype,
+                )
+                parent_model.load_lora_weights(
+                    input_dir, weight_name="adapter_model.safetensors", adapter_name="qwenimage-lora"
+                )
+                parent_model.fuse_lora()
+            else:
+                model.merge_and_unload()  # merge LoRA weights into the base model
+                parent_model.transformer = model.get_base_model()  # replace the transformer with the LoRA-enhanced model
+            parent_model.set_train(False)
+
+        load_lora_model(model, parent_model, os.path.join(args.output_dir, "lora"))
+
+        # inference function
+        def inference(txt):
+            image = parent_model(
+                prompt=txt,
+                width=data_args.width,
+                height=data_args.height,
+                num_inference_steps=8,
+                true_cfg_scale=1.0,
+                generator=np.random.Generator(np.random.PCG64(seed=42)),
+            )[0][0]
+            return image
+
+        def calculate_pixel_error(img1, img2):
+            arr1 = np.array(img1, dtype=np.float32)
+            arr2 = np.array(img2, dtype=np.float32)
+            return np.mean(np.abs(arr1 - arr2))
+
+        for idx, example in enumerate(eval_dataset):
+            infer_image = inference(example["text"])
+            infer_image_save_path = os.path.join(args.output_dir, f"infer_{idx}.png")
+            infer_image.save(infer_image_save_path)
+
+            ref_image = Image.open(io.BytesIO(example["image"]["bytes"])).convert("RGB").resize((512, 512))
+            ref_image_save_path = os.path.join(args.output_dir, f"ref_{idx}.png")
+            ref_image.save(ref_image_save_path)
+            error = calculate_pixel_error(infer_image, ref_image)
+
+            log_entry = f"Generation: #{idx} in {infer_image_save_path} with pixel errors {error:.2}\n"
+            with open(os.path.join(args.output_dir, "results.txt"), "a") as f:
+                print(log_entry.strip(), file=f)
+
+
+class TrainStepForQwenImage(nn.Cell):
+    def __init__(self, base_model, vae_decode, scheduler, image_postprocess, data_args):
+        super().__init__()
+        self.base_model = base_model
+        self.vae_decode = vae_decode
+        self.scheduler = scheduler
+        self.image_postprocess = image_postprocess
+        self.args = data_args
+
+    @staticmethod
+    def _unpack_latents(latents, height, width, vae_scale_factor):
+        batch_size, _, channels = latents.shape
+
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = 2 * (int(height) // (vae_scale_factor * 2))
+        width = 2 * (int(width) // (vae_scale_factor * 2))
+
+        latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
+        latents = latents.permute(0, 3, 1, 4, 2, 5)
+
+        latents = latents.reshape(batch_size, channels // (2 * 2), 1, height, width)
+
+        return latents
+
+    def retrieve_timesteps(
+        self,
+        scheduler,
+        num_inference_steps: Optional[int] = None,
+        timesteps: Optional[List[int]] = None,
+        sigmas: Optional[List[float]] = None,
+        **kwargs,
+    ):
+        if timesteps is not None and sigmas is not None:
+            raise ValueError(
+                "Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values"
+            )
+        if timesteps is not None:
+            accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+            if not accepts_timesteps:
+                raise ValueError(
+                    f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                    f" timestep schedules. Please check whether you are using the correct scheduler."
+                )
+            scheduler.set_timesteps(timesteps=timesteps, **kwargs)
+            timesteps = scheduler.timesteps
+            num_inference_steps = len(timesteps)
+        elif sigmas is not None:
+            accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+            if not accept_sigmas:
+                raise ValueError(
+                    f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                    f" sigmas schedules. Please check whether you are using the correct scheduler."
+                )
+            scheduler.set_timesteps(sigmas=sigmas, **kwargs)
+            timesteps = scheduler.timesteps
+            num_inference_steps = len(timesteps)
+        else:
+            scheduler.set_timesteps(num_inference_steps, **kwargs)
+            timesteps = scheduler.timesteps
+        return timesteps, num_inference_steps
+
+    def calculate_shift(
+        self,
+        image_seq_len,
+        base_seq_len: int = 256,
+        max_seq_len: int = 4096,
+        base_shift: float = 0.5,
+        max_shift: float = 1.15,
+    ):
+        m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
+        b = base_shift - m * base_seq_len
+        mu = image_seq_len * m + b
+        return mu
+
+    def construct(
+        self,
+        hidden_states,
+        encoder_hidden_states,
+        encoder_hidden_states_mask,
+        txt_seq_lens,
+        labels,
+        *args,
+    ):
+        # Prepare timesteps
+        latents = hidden_states
+        sigmas = np.linspace(1.0, 1 / self.args.num_inference_steps, self.args.num_inference_steps)
+        image_seq_len = latents.shape[1]
+        mu = self.calculate_shift(
+            image_seq_len,
+            self.scheduler.config.get("base_image_seq_len", 256),
+            self.scheduler.config.get("max_image_seq_len", 4096),
+            self.scheduler.config.get("base_shift", 0.5),
+            self.scheduler.config.get("max_shift", 1.15),
+        )
+        timesteps, _ = self.retrieve_timesteps(
+            self.scheduler,
+            self.args.num_inference_steps,
+            sigmas=sigmas,
+            mu=mu,
+        )
+
+        # Denoising loop
+        self.scheduler.set_begin_index(0)
+        for i, t in enumerate(timesteps):
+            timestep = t.expand((latents.shape[0],)).to(latents.dtype)
+            noise_pred = self.base_model(
+                hidden_states=latents,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_hidden_states_mask=encoder_hidden_states_mask,
+                timestep=timestep / 1000,
+                img_shapes=[(1, 32, 32)],
+                txt_seq_lens=txt_seq_lens,
+                return_dict=False,
+            )[0]
+
+            latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+
+        latents = self._unpack_latents(latents, self.args.height, self.args.width, 8)  # vae_scale_factor=8
+        latents = latents.to(self.args.ms_dtype)
+        latents_mean = (
+            ms.tensor(self.args.vae_config.latents_mean).view(1, self.args.vae_config.z_dim, 1, 1, 1).to(latents.dtype)
+        )
+        latents_std = 1.0 / ms.tensor(self.args.vae_config.latents_std).view(1, self.args.vae_config.z_dim, 1, 1, 1).to(
+            latents.dtype
+        )
+        latents = latents / latents_std + latents_mean
+        preds = self.vae_decode(latents, return_dict=False)[0][:, :, 0]
+        preds = self.image_postprocess(preds, output_type="ms")
+
+        loss = ms.mint.mean(
+            ((preds - labels) ** 2).reshape(preds.shape[0], -1),
+            dim=1,
+        )
+
+        return loss
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/diffusers/qwenimage/zero3.json b/examples/diffusers/qwenimage/zero3.json
new file mode 100644
index 0000000000..a9af51f4f4
--- /dev/null
+++ b/examples/diffusers/qwenimage/zero3.json
@@ -0,0 +1,5 @@
+{
+    "zero_optimization": {
+        "stage": 3
+    }
+}
diff --git a/mindone/diffusers/models/transformers/transformer_qwenimage.py b/mindone/diffusers/models/transformers/transformer_qwenimage.py
index db0b47de8a..e11d8e4892 100644
---
a/mindone/diffusers/models/transformers/transformer_qwenimage.py +++ b/mindone/diffusers/models/transformers/transformer_qwenimage.py @@ -26,7 +26,7 @@ from ...configuration_utils import ConfigMixin, register_to_config from ...loaders import FromOriginalModelMixin, PeftAdapterMixin from ...utils import logging -from ..attention import FeedForward +from ..attention import AttentionMixin, FeedForward from ..attention_processor import Attention from ..embeddings import TimestepEmbedding, Timesteps from ..layers_compat import unflatten, view_as_complex @@ -109,31 +109,32 @@ def apply_rotary_emb_qwen( Returns: Tuple[ms.Tensor, ms.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings. """ - if use_real: - cos, sin = freqs_cis # [S, D] - cos = cos[None, None] - sin = sin[None, None] - - if use_real_unbind_dim == -1: - # Used for flux, cogvideox, hunyuan-dit - x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1) # [B, S, H, D//2] - x_rotated = mint.stack([-x_imag, x_real], dim=-1).flatten(3) - elif use_real_unbind_dim == -2: - # Used for Stable Audio, OmniGen, CogView4 and Cosmos - x_real, x_imag = x.reshape(*x.shape[:-1], 2, -1).unbind(-2) # [B, S, H, D//2] - x_rotated = mint.cat([-x_imag, x_real], dim=-1) - else: - raise ValueError(f"`use_real_unbind_dim={use_real_unbind_dim}` but should be -1 or -2.") + with ms._no_grad(): # to support training + if use_real: + cos, sin = freqs_cis # [S, D] + cos = cos[None, None] + sin = sin[None, None] + + if use_real_unbind_dim == -1: + # Used for flux, cogvideox, hunyuan-dit + x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1) # [B, S, H, D//2] + x_rotated = mint.stack([-x_imag, x_real], dim=-1).flatten(3) + elif use_real_unbind_dim == -2: + # Used for Stable Audio, OmniGen, CogView4 and Cosmos + x_real, x_imag = x.reshape(*x.shape[:-1], 2, -1).unbind(-2) # [B, S, H, D//2] + x_rotated = mint.cat([-x_imag, x_real], dim=-1) + else: + raise ValueError(f"`use_real_unbind_dim={use_real_unbind_dim}` but should be -1 or -2.") - out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype) + out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype) - return out - else: - x_rotated = view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2)) - freqs_cis = freqs_cis.unsqueeze(1) - x_out = ops.view_as_real(x_rotated * freqs_cis).flatten(3) + return out + else: + x_rotated = view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2)) + freqs_cis = freqs_cis.unsqueeze(1) + x_out = ops.view_as_real(x_rotated * freqs_cis).flatten(3) - return x_out.type_as(x) + return x_out.type_as(x) class QwenTimestepProjEmbeddings(nn.Cell): @@ -330,7 +331,6 @@ def __call__( return img_attn_output, txt_attn_output -# @jit_class class QwenImageTransformerBlock(nn.Cell): def __init__( self, dim: int, num_attention_heads: int, attention_head_dim: int, qk_norm: str = "rms_norm", eps: float = 1e-6 @@ -446,7 +446,7 @@ def construct( return encoder_hidden_states, hidden_states -class QwenImageTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin): +class QwenImageTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, AttentionMixin): """ The Transformer model introduced in Qwen. 
@@ -529,7 +529,7 @@ def construct( txt_seq_lens: Optional[List[int]] = None, guidance: ms.Tensor = None, # TODO: this should probably be removed attention_kwargs: Optional[Dict[str, Any]] = None, - controlnet_block_samples=None, + controlnet_block_samples: ms.Tensor = None, return_dict: bool = True, ) -> Union[ms.Tensor, Transformer2DModelOutput]: """ @@ -586,14 +586,25 @@ def construct( image_rotary_emb = self.pos_embed(img_shapes, txt_seq_lens) for index_block, block in enumerate(self.transformer_blocks): - encoder_hidden_states, hidden_states = block( - hidden_states=hidden_states, - encoder_hidden_states=encoder_hidden_states, - encoder_hidden_states_mask=encoder_hidden_states_mask, - temb=temb, - image_rotary_emb=image_rotary_emb, - joint_attention_kwargs=attention_kwargs, - ) + if self.gradient_checkpointing and self.training: + encoder_hidden_states, hidden_states = ms.recompute( + block, + hidden_states, + encoder_hidden_states, + encoder_hidden_states_mask, + temb, + image_rotary_emb, + attention_kwargs, + ) + else: + encoder_hidden_states, hidden_states = block( + hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + encoder_hidden_states_mask=encoder_hidden_states_mask, + temb=temb, + image_rotary_emb=image_rotary_emb, + joint_attention_kwargs=attention_kwargs, + ) # controlnet residual if controlnet_block_samples is not None: From 13ec825ecbf3352e37f7645795c4b78343dc826f Mon Sep 17 00:00:00 2001 From: Dong1017 Date: Mon, 27 Oct 2025 15:42:47 +0800 Subject: [PATCH 2/8] fix: diffusers merged PR 12170, 12261 --- .../loaders/lora_conversion_utils.py | 61 ++++++++++++------- mindone/diffusers/loaders/lora_pipeline.py | 4 +- 2 files changed, 43 insertions(+), 22 deletions(-) diff --git a/mindone/diffusers/loaders/lora_conversion_utils.py b/mindone/diffusers/loaders/lora_conversion_utils.py index aa41648a81..4b2d6f3766 100644 --- a/mindone/diffusers/loaders/lora_conversion_utils.py +++ b/mindone/diffusers/loaders/lora_conversion_utils.py @@ -2304,6 +2304,10 @@ def _convert_non_diffusers_ltxv_lora_to_diffusers(state_dict, non_diffusers_pref def _convert_non_diffusers_qwen_lora_to_diffusers(state_dict): + has_diffusion_model = any(k.startswith("diffusion_model.") for k in state_dict) + if has_diffusion_model: + state_dict = {k.removeprefix("diffusion_model."): v for k, v in state_dict.items()} + has_lora_unet = any(k.startswith("lora_unet_") for k in state_dict) if has_lora_unet: state_dict = {k.removeprefix("lora_unet_"): v for k, v in state_dict.items()} @@ -2376,29 +2380,44 @@ def convert_key(key: str) -> str: all_keys = list(state_dict.keys()) down_key = ".lora_down.weight" up_key = ".lora_up.weight" + a_key = ".lora_A.weight" + b_key = ".lora_B.weight" - def get_alpha_scales(down_weight, alpha_key): - rank = down_weight.shape[0] - alpha = state_dict.pop(alpha_key).item() - scale = alpha / rank # LoRA is scaled by 'alpha / rank' in forward pass, so we need to scale it back here - scale_down = scale - scale_up = 1.0 - while scale_down * 2 < scale_up: - scale_down *= 2 - scale_up /= 2 - return scale_down, scale_up + has_non_diffusers_lora_id = any(down_key in k or up_key in k for k in all_keys) + has_diffusers_lora_id = any(a_key in k or b_key in k for k in all_keys) - for k in all_keys: - if k.endswith(down_key): - diffusers_down_key = k.replace(down_key, ".lora_A.weight") - diffusers_up_key = k.replace(down_key, up_key).replace(up_key, ".lora_B.weight") - alpha_key = k.replace(down_key, ".alpha") - - down_weight = state_dict.pop(k) - up_weight = 
state_dict.pop(k.replace(down_key, up_key)) - scale_down, scale_up = get_alpha_scales(down_weight, alpha_key) - converted_state_dict[diffusers_down_key] = Parameter(down_weight * scale_down) - converted_state_dict[diffusers_up_key] = Parameter(up_weight * scale_up) + if has_non_diffusers_lora_id: + + def get_alpha_scales(down_weight, alpha_key): + rank = down_weight.shape[0] + alpha = state_dict.pop(alpha_key).item() + scale = alpha / rank # LoRA is scaled by 'alpha / rank' in forward pass, so we need to scale it back here + scale_down = scale + scale_up = 1.0 + while scale_down * 2 < scale_up: + scale_down *= 2 + scale_up /= 2 + return scale_down, scale_up + + for k in all_keys: + if k.endswith(down_key): + diffusers_down_key = k.replace(down_key, ".lora_A.weight") + diffusers_up_key = k.replace(down_key, up_key).replace(up_key, ".lora_B.weight") + alpha_key = k.replace(down_key, ".alpha") + + down_weight = state_dict.pop(k) + up_weight = state_dict.pop(k.replace(down_key, up_key)) + scale_down, scale_up = get_alpha_scales(down_weight, alpha_key) + converted_state_dict[diffusers_down_key] = Parameter(down_weight * scale_down) + converted_state_dict[diffusers_up_key] = Parameter(up_weight * scale_up) + + # Already in diffusers format (lora_A/lora_B), just pop + elif has_diffusers_lora_id: + for k in all_keys: + if a_key in k or b_key in k: + converted_state_dict[k] = state_dict.pop(k) + elif ".alpha" in k: + state_dict.pop(k) if len(state_dict) > 0: raise ValueError(f"`state_dict` should be empty at this point but has {state_dict.keys()=}") diff --git a/mindone/diffusers/loaders/lora_pipeline.py b/mindone/diffusers/loaders/lora_pipeline.py index 4d91ede7be..eee37acf6c 100644 --- a/mindone/diffusers/loaders/lora_pipeline.py +++ b/mindone/diffusers/loaders/lora_pipeline.py @@ -6175,7 +6175,9 @@ def lora_state_dict( state_dict = {k: v for k, v in state_dict.items() if "dora_scale" not in k} has_alphas_in_sd = any(k.endswith(".alpha") for k in state_dict) - if has_alphas_in_sd: + has_lora_unet = any(k.startswith("lora_unet_") for k in state_dict) + has_diffusion_model = any(k.startswith("diffusion_model.") for k in state_dict) + if has_alphas_in_sd or has_lora_unet or has_diffusion_model: state_dict = _convert_non_diffusers_qwen_lora_to_diffusers(state_dict) out = (state_dict, metadata) if return_lora_metadata else state_dict From 761c19691c82ea97f1fc6eca550d522112e0552a Mon Sep 17 00:00:00 2001 From: Dong1017 Date: Mon, 27 Oct 2025 16:02:02 +0800 Subject: [PATCH 3/8] fix: diffusers merged PR 12181 --- docs/diffusers/api/pipelines/qwenimage.md | 58 ++++++++++++++++++- .../pipelines/qwenimage/pipeline_qwenimage.py | 5 ++ .../qwenimage/pipeline_qwenimage_edit.py | 5 ++ .../qwenimage/pipeline_qwenimage_img2img.py | 5 ++ .../qwenimage/pipeline_qwenimage_inpaint.py | 5 ++ 5 files changed, 77 insertions(+), 1 deletion(-) diff --git a/docs/diffusers/api/pipelines/qwenimage.md b/docs/diffusers/api/pipelines/qwenimage.md index 73408a656f..8b56faef54 100644 --- a/docs/diffusers/api/pipelines/qwenimage.md +++ b/docs/diffusers/api/pipelines/qwenimage.md @@ -30,7 +30,63 @@ Qwen-Image comes in the following variants: !!! tip Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines. 
-    In addition, the default version of installed `transformers` in `mindone` is `4.50.0`, but `transformers==4.52.1` is required for Qwen-Image. Please using `pip install transformers==4.52.1` to upgrade, if you want to try related Qwen-Image pipelines.
+
+## LoRA for faster inference
+
+Use a LoRA from `lightx2v/Qwen-Image-Lightning` to speed up inference by reducing the
+number of steps. Refer to the code snippet below:
+
+```py
+from mindone.diffusers import DiffusionPipeline, FlowMatchEulerDiscreteScheduler
+import mindspore
+import math
+
+scheduler_config = {
+    "base_image_seq_len": 256,
+    "base_shift": math.log(3),  # We use shift=3 in distillation
+    "invert_sigmas": False,
+    "max_image_seq_len": 8192,
+    "max_shift": math.log(3),  # We use shift=3 in distillation
+    "num_train_timesteps": 1000,
+    "shift": 1.0,
+    "shift_terminal": None,  # set shift_terminal to None
+    "stochastic_sampling": False,
+    "time_shift_type": "exponential",
+    "use_beta_sigmas": False,
+    "use_dynamic_shifting": True,
+    "use_exponential_sigmas": False,
+    "use_karras_sigmas": False,
+}
+scheduler = FlowMatchEulerDiscreteScheduler.from_config(scheduler_config)
+pipe = DiffusionPipeline.from_pretrained(
+    "Qwen/Qwen-Image", scheduler=scheduler, mindspore_dtype=mindspore.bfloat16
+)
+pipe.load_lora_weights(
+    "lightx2v/Qwen-Image-Lightning",
+    weight_name="Qwen-Image-Lightning-8steps-V1.0.safetensors",
+    adapter_name="qwenimage-lora"
+)
+pipe.fuse_lora()
+pipe.unload_lora_weights()
+
+prompt = "a tiny astronaut hatching from an egg on the moon, Ultra HD, 4K, cinematic composition."
+negative_prompt = " "
+image = pipe(
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    width=1024,
+    height=1024,
+    num_inference_steps=8,
+    true_cfg_scale=1.0,
+    generator=None,
+)[0][0]
+image.save("lora_pic/qwen_fewsteps_lora.png")
+```
+
+!!! tip
+
+    The `guidance_scale` parameter in the pipeline is there to support future guidance-distilled models when they come up.
+    Note that passing `guidance_scale` to the pipeline is ineffective. To enable classifier-free guidance, pass
+    `true_cfg_scale` and a `negative_prompt` (even an empty negative prompt like " " enables the classifier-free
+    guidance computations).
 
 ::: mindone.diffusers.QwenImagePipeline
diff --git a/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage.py b/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage.py
index 64d640c031..5991c1cf23 100644
--- a/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage.py
+++ b/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage.py
@@ -474,6 +474,11 @@ def __call__(
                 of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                 `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
                 the text `prompt`, usually at the expense of lower image quality.
+
+                This parameter in the pipeline is there to support future guidance-distilled models when they come up.
+                Note that passing `guidance_scale` to the pipeline is ineffective. To enable classifier-free guidance,
+                please pass `true_cfg_scale` and `negative_prompt` (even an empty negative prompt like " ") should
+                enable classifier-free guidance computations.
             num_images_per_prompt (`int`, *optional*, defaults to 1):
                 The number of images to generate per prompt.
generator (`np.random.Generator` or `List[np.random.Generator]`, *optional*): diff --git a/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py b/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py index 7fe3fbbe2d..f6b035cb83 100644 --- a/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py +++ b/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py @@ -563,6 +563,11 @@ def __call__( enabled by setting `true_cfg_scale > 1` and a provided `negative_prompt`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality. + + This parameter in the pipeline is there to support future guidance-distilled models when they come up. + Note that passing `guidance_scale` to the pipeline is ineffective. To enable classifier-free guidance, + please pass `true_cfg_scale` and `negative_prompt` (even an empty negative prompt like " ") should + enable classifier-free guidance computations. height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image. This is set to 1024 by default for the best results. width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): diff --git a/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_img2img.py b/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_img2img.py index 1d3f036ce8..1d4e29f18e 100644 --- a/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_img2img.py +++ b/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_img2img.py @@ -559,6 +559,11 @@ def __call__( of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality. + + This parameter in the pipeline is there to support future guidance-distilled models when they come up. + Note that passing `guidance_scale` to the pipeline is ineffective. To enable classifier-free guidance, + please pass `true_cfg_scale` and `negative_prompt` (even an empty negative prompt like " ") should + enable classifier-free guidance computations. num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. generator (`np.random.Generator` or `List[np.random.Generator]`, *optional*): diff --git a/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_inpaint.py b/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_inpaint.py index 9af4f5b083..b1ad166866 100644 --- a/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_inpaint.py +++ b/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_inpaint.py @@ -688,6 +688,11 @@ def __call__( of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality. + + This parameter in the pipeline is there to support future guidance-distilled models when they come up. + Note that passing `guidance_scale` to the pipeline is ineffective. To enable classifier-free guidance, + please pass `true_cfg_scale` and `negative_prompt` (even an empty negative prompt like " ") should + enable classifier-free guidance computations. 
num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. generator (`np.random.Generator` or `List[np.random.Generator]`, *optional*): From 3eef2b58d92d19888049d4829bef1b359074cd89 Mon Sep 17 00:00:00 2001 From: Dong1017 Date: Mon, 27 Oct 2025 16:27:49 +0800 Subject: [PATCH 4/8] fix: diffusers merged PR 12223, controlnet to be added --- .../pipelines/qwenimage/pipeline_qwenimage.py | 51 +++++++++++++------ .../qwenimage/pipeline_qwenimage_edit.py | 5 -- .../qwenimage/pipeline_qwenimage_img2img.py | 51 +++++++++++++------ .../qwenimage/pipeline_qwenimage_inpaint.py | 49 +++++++++++++----- 4 files changed, 107 insertions(+), 49 deletions(-) diff --git a/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage.py b/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage.py index 5991c1cf23..d830daf946 100644 --- a/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage.py +++ b/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage.py @@ -429,7 +429,7 @@ def __call__( width: Optional[int] = None, num_inference_steps: int = 50, sigmas: Optional[List[float]] = None, - guidance_scale: float = 1.0, + guidance_scale: Optional[float] = None, num_images_per_prompt: int = 1, generator: Optional[Union[np.random.Generator, List[np.random.Generator]]] = None, latents: Optional[ms.tensor] = None, @@ -456,7 +456,12 @@ def __call__( `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `true_cfg_scale` is not greater than `1`). true_cfg_scale (`float`, *optional*, defaults to 1.0): - When > 1.0 and a provided `negative_prompt`, enables true classifier-free guidance. + Guidance scale as defined in [Classifier-Free Diffusion + Guidance](https://huggingface.co/papers/2207.12598). `true_cfg_scale` is defined as `w` of equation 2. + of [Imagen Paper](https://huggingface.co/papers/2205.11487). Classifier-free guidance is enabled by + setting `true_cfg_scale > 1` and a provided `negative_prompt`. Higher guidance scale encourages to + generate images that are closely linked to the text `prompt`, usually at the expense of lower image + quality. height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image. This is set to 1024 by default for the best results. width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): @@ -468,17 +473,16 @@ def __call__( Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed will be used. - guidance_scale (`float`, *optional*, defaults to 3.5): - Guidance scale as defined in [Classifier-Free Diffusion - Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2. - of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting - `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to - the text `prompt`, usually at the expense of lower image quality. - - This parameter in the pipeline is there to support future guidance-distilled models when they come up. - Note that passing `guidance_scale` to the pipeline is ineffective. To enable classifier-free guidance, - please pass `true_cfg_scale` and `negative_prompt` (even an empty negative prompt like " ") should - enable classifier-free guidance computations. 
+ guidance_scale (`float`, *optional*, defaults to None): + A guidance scale value for guidance distilled models. Unlike the traditional classifier-free guidance + where the guidance scale is applied during inference through noise prediction rescaling, guidance + distilled models take the guidance scale directly as an input parameter during forward pass. Guidance + scale is enabled by setting `guidance_scale > 1`. Higher guidance scale encourages to generate images + that are closely linked to the text `prompt`, usually at the expense of lower image quality. This + parameter in the pipeline is there to support future guidance-distilled models when they come up. It is + ignored when not using guidance distilled models. To enable traditional classifier-free guidance, + please pass `true_cfg_scale > 1.0` and `negative_prompt` (even an empty negative prompt like " " should + enable classifier-free guidance computations). num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. generator (`np.random.Generator` or `List[np.random.Generator]`, *optional*): @@ -556,6 +560,16 @@ def __call__( has_neg_prompt = negative_prompt is not None or ( negative_prompt_embeds is not None and negative_prompt_embeds_mask is not None ) + + if true_cfg_scale > 1 and not has_neg_prompt: + logger.warning( + f"true_cfg_scale is passed as {true_cfg_scale}, but classifier-free guidance is not enabled since no negative_prompt is provided." + ) + elif true_cfg_scale <= 1 and has_neg_prompt: + logger.warning( + " negative_prompt is passed but classifier-free guidance is not enabled since true_cfg_scale <= 1" + ) + do_true_cfg = true_cfg_scale > 1 and has_neg_prompt prompt_embeds, prompt_embeds_mask = self.encode_prompt( prompt=prompt, @@ -606,10 +620,17 @@ def __call__( self._num_timesteps = len(timesteps) # handle guidance - if self.transformer.config.guidance_embeds: + if self.transformer.config.guidance_embeds and guidance_scale is None: + raise ValueError("guidance_scale is required for guidance-distilled model.") + elif self.transformer.config.guidance_embeds: guidance = mint.full([1], guidance_scale, dtype=ms.float32) guidance = guidance.expand((latents.shape[0],)) - else: + elif not self.transformer.config.guidance_embeds and guidance_scale is not None: + logger.warning( + f"guidance_scale is passed as {guidance_scale}, but ignored since the model is not guidance-distilled." + ) + guidance = None + elif not self.transformer.config.guidance_embeds and guidance_scale is None: guidance = None if self.attention_kwargs is None: diff --git a/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py b/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py index f6b035cb83..7fe3fbbe2d 100644 --- a/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py +++ b/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py @@ -563,11 +563,6 @@ def __call__( enabled by setting `true_cfg_scale > 1` and a provided `negative_prompt`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality. - - This parameter in the pipeline is there to support future guidance-distilled models when they come up. - Note that passing `guidance_scale` to the pipeline is ineffective. To enable classifier-free guidance, - please pass `true_cfg_scale` and `negative_prompt` (even an empty negative prompt like " ") should - enable classifier-free guidance computations. 
height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image. This is set to 1024 by default for the best results. width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): diff --git a/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_img2img.py b/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_img2img.py index 1d4e29f18e..035648ec0d 100644 --- a/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_img2img.py +++ b/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_img2img.py @@ -502,7 +502,7 @@ def __call__( strength: float = 0.6, num_inference_steps: int = 50, sigmas: Optional[List[float]] = None, - guidance_scale: float = 1.0, + guidance_scale: Optional[float] = None, num_images_per_prompt: int = 1, generator: Optional[Union[np.random.Generator, List[np.random.Generator]]] = None, latents: Optional[ms.tensor] = None, @@ -535,7 +535,12 @@ def __call__( list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image latents as `image`, but if passing latents directly it is not encoded again. true_cfg_scale (`float`, *optional*, defaults to 1.0): - When > 1.0 and a provided `negative_prompt`, enables true classifier-free guidance. + Guidance scale as defined in [Classifier-Free Diffusion + Guidance](https://huggingface.co/papers/2207.12598). `true_cfg_scale` is defined as `w` of equation 2. + of [Imagen Paper](https://huggingface.co/papers/2205.11487). Classifier-free guidance is enabled by + setting `true_cfg_scale > 1` and a provided `negative_prompt`. Higher guidance scale encourages to + generate images that are closely linked to the text `prompt`, usually at the expense of lower image + quality. height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image. This is set to 1024 by default for the best results. width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): @@ -553,17 +558,16 @@ def __call__( Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed will be used. - guidance_scale (`float`, *optional*, defaults to 3.5): - Guidance scale as defined in [Classifier-Free Diffusion - Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2. - of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting - `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to - the text `prompt`, usually at the expense of lower image quality. - - This parameter in the pipeline is there to support future guidance-distilled models when they come up. - Note that passing `guidance_scale` to the pipeline is ineffective. To enable classifier-free guidance, - please pass `true_cfg_scale` and `negative_prompt` (even an empty negative prompt like " ") should - enable classifier-free guidance computations. + guidance_scale (`float`, *optional*, defaults to None): + A guidance scale value for guidance distilled models. Unlike the traditional classifier-free guidance + where the guidance scale is applied during inference through noise prediction rescaling, guidance + distilled models take the guidance scale directly as an input parameter during forward pass. 
Guidance + scale is enabled by setting `guidance_scale > 1`. Higher guidance scale encourages to generate images + that are closely linked to the text `prompt`, usually at the expense of lower image quality. This + parameter in the pipeline is there to support future guidance-distilled models when they come up. It is + ignored when not using guidance distilled models. To enable traditional classifier-free guidance, + please pass `true_cfg_scale > 1.0` and `negative_prompt` (even an empty negative prompt like " " should + enable classifier-free guidance computations). num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. generator (`np.random.Generator` or `List[np.random.Generator]`, *optional*): @@ -646,6 +650,16 @@ def __call__( has_neg_prompt = negative_prompt is not None or ( negative_prompt_embeds is not None and negative_prompt_embeds_mask is not None ) + + if true_cfg_scale > 1 and not has_neg_prompt: + logger.warning( + f"true_cfg_scale is passed as {true_cfg_scale}, but classifier-free guidance is not enabled since no negative_prompt is provided." + ) + elif true_cfg_scale <= 1 and has_neg_prompt: + logger.warning( + " negative_prompt is passed but classifier-free guidance is not enabled since true_cfg_scale <= 1" + ) + do_true_cfg = true_cfg_scale > 1 and has_neg_prompt prompt_embeds, prompt_embeds_mask = self.encode_prompt( prompt=prompt, @@ -706,10 +720,17 @@ def __call__( self._num_timesteps = len(timesteps) # handle guidance - if self.transformer.config.guidance_embeds: + if self.transformer.config.guidance_embeds and guidance_scale is None: + raise ValueError("guidance_scale is required for guidance-distilled model.") + elif self.transformer.config.guidance_embeds: guidance = mint.full([1], guidance_scale, dtype=ms.float32) guidance = guidance.expand((latents.shape[0],)) - else: + elif not self.transformer.config.guidance_embeds and guidance_scale is not None: + logger.warning( + f"guidance_scale is passed as {guidance_scale}, but ignored since the model is not guidance-distilled." + ) + guidance = None + elif not self.transformer.config.guidance_embeds and guidance_scale is None: guidance = None if self.attention_kwargs is None: diff --git a/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_inpaint.py b/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_inpaint.py index b1ad166866..de89eaf5b5 100644 --- a/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_inpaint.py +++ b/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_inpaint.py @@ -614,7 +614,7 @@ def __call__( strength: float = 0.6, num_inference_steps: int = 50, sigmas: Optional[List[float]] = None, - guidance_scale: float = 1.0, + guidance_scale: Optional[float] = None, num_images_per_prompt: int = 1, generator: Optional[Union[np.random.Generator, List[np.random.Generator]]] = None, latents: Optional[ms.tensor] = None, @@ -647,7 +647,12 @@ def __call__( list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image latents as `image`, but if passing latents directly it is not encoded again. true_cfg_scale (`float`, *optional*, defaults to 1.0): - When > 1.0 and a provided `negative_prompt`, enables true classifier-free guidance. + Guidance scale as defined in [Classifier-Free Diffusion + Guidance](https://huggingface.co/papers/2207.12598). `true_cfg_scale` is defined as `w` of equation 2. + of [Imagen Paper](https://huggingface.co/papers/2205.11487). 
Classifier-free guidance is enabled by + setting `true_cfg_scale > 1` and a provided `negative_prompt`. Higher guidance scale encourages to + generate images that are closely linked to the text `prompt`, usually at the expense of lower image + quality. mask_image (`ms.tensor`, `PIL.Image.Image`, `np.ndarray`, `List[ms.tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, numpy array or tensor representing an image batch to mask `image`. White pixels in the mask are repainted while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a @@ -683,16 +688,15 @@ def __call__( their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed will be used. guidance_scale (`float`, *optional*, defaults to 3.5): - Guidance scale as defined in [Classifier-Free Diffusion - Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2. - of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting - `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to - the text `prompt`, usually at the expense of lower image quality. - - This parameter in the pipeline is there to support future guidance-distilled models when they come up. - Note that passing `guidance_scale` to the pipeline is ineffective. To enable classifier-free guidance, - please pass `true_cfg_scale` and `negative_prompt` (even an empty negative prompt like " ") should - enable classifier-free guidance computations. + A guidance scale value for guidance distilled models. Unlike the traditional classifier-free guidance + where the guidance scale is applied during inference through noise prediction rescaling, guidance + distilled models take the guidance scale directly as an input parameter during forward pass. Guidance + scale is enabled by setting `guidance_scale > 1`. Higher guidance scale encourages to generate images + that are closely linked to the text `prompt`, usually at the expense of lower image quality. This + parameter in the pipeline is there to support future guidance-distilled models when they come up. It is + ignored when not using guidance distilled models. To enable traditional classifier-free guidance, + please pass `true_cfg_scale > 1.0` and `negative_prompt` (even an empty negative prompt like " " should + enable classifier-free guidance computations). num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. generator (`np.random.Generator` or `List[np.random.Generator]`, *optional*): @@ -789,6 +793,16 @@ def __call__( has_neg_prompt = negative_prompt is not None or ( negative_prompt_embeds is not None and negative_prompt_embeds_mask is not None ) + + if true_cfg_scale > 1 and not has_neg_prompt: + logger.warning( + f"true_cfg_scale is passed as {true_cfg_scale}, but classifier-free guidance is not enabled since no negative_prompt is provided." 
+ ) + elif true_cfg_scale <= 1 and has_neg_prompt: + logger.warning( + " negative_prompt is passed but classifier-free guidance is not enabled since true_cfg_scale <= 1" + ) + do_true_cfg = true_cfg_scale > 1 and has_neg_prompt prompt_embeds, prompt_embeds_mask = self.encode_prompt( prompt=prompt, @@ -873,10 +887,17 @@ def __call__( self._num_timesteps = len(timesteps) # handle guidance - if self.transformer.config.guidance_embeds: + if self.transformer.config.guidance_embeds and guidance_scale is None: + raise ValueError("guidance_scale is required for guidance-distilled model.") + elif self.transformer.config.guidance_embeds: guidance = mint.full([1], guidance_scale, dtype=ms.float32) guidance = guidance.expand((latents.shape[0],)) - else: + elif not self.transformer.config.guidance_embeds and guidance_scale is not None: + logger.warning( + f"guidance_scale is passed as {guidance_scale}, but ignored since the model is not guidance-distilled." + ) + guidance = None + elif not self.transformer.config.guidance_embeds and guidance_scale is None: guidance = None if self.attention_kwargs is None: From 69464c1a11a40182982b6bb85c203c2c58871b44 Mon Sep 17 00:00:00 2001 From: Dong1017 Date: Mon, 27 Oct 2025 16:37:09 +0800 Subject: [PATCH 5/8] Revised: lora finetune script according to gemini --- .../finetune_lora_with_mindspore_trainer.py | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/examples/diffusers/qwenimage/finetune_lora_with_mindspore_trainer.py b/examples/diffusers/qwenimage/finetune_lora_with_mindspore_trainer.py index 540a0152cb..fe2057e84d 100644 --- a/examples/diffusers/qwenimage/finetune_lora_with_mindspore_trainer.py +++ b/examples/diffusers/qwenimage/finetune_lora_with_mindspore_trainer.py @@ -93,6 +93,7 @@ class MyArguments(MindSporeArguments, TrainingArguments): ) # no use resume: Union[bool, str] = field(default=False, metadata={"help": "Resume training from a checkpoint."}) save_strategy: str = field(default="no", metadata={"help": "Save strategy, no, steps or epoch"}) + seed: int = field(default=42) @dataclass @@ -135,12 +136,20 @@ def main(): # 1.2 the dataset dataset = load_dataset("parquet", data_dir=args.dataset_path, split="train") - dataset = dataset.shuffle(seed=42) - train_indices = list(range(666)) - eval_indices = list(range(666, 833)) + dataset = dataset.shuffle(seed=args.seed) + + total_size = len(dataset) + train_size = int(total_size * 0.8) + + train_indices = list(range(train_size)) + eval_indices = list(range(train_size, total_size)) def process_function(examples): - image = Image.open(io.BytesIO(examples["image"]["bytes"])).convert("RGB").resize((512, 512)) + image = ( + Image.open(io.BytesIO(examples["image"]["bytes"])) + .convert("RGB") + .resize((data_args.width, data_args.height)) + ) txt = examples["text"] # prepare the inputs @@ -154,7 +163,7 @@ def process_function(examples): height=height, width=width, dtype=encoder_hidden_states.dtype, - generator=np.random.Generator(np.random.PCG64(seed=42)), + generator=np.random.Generator(np.random.PCG64(seed=args.seed)), latents=None, ) @@ -353,7 +362,7 @@ def inference(txt): height=data_args.height, num_inference_steps=8, true_cfg_scale=1.0, - generator=np.random.Generator(np.random.PCG64(seed=42)), + generator=np.random.Generator(np.random.PCG64(seed=args.seed)), )[0][0] return image @@ -488,7 +497,7 @@ def construct( encoder_hidden_states=encoder_hidden_states, encoder_hidden_states_mask=encoder_hidden_states_mask, timestep=timestep / 1000, - img_shapes=[(1, 32, 32)], + 
img_shapes=[(1, self.args.height // 16, self.args.width // 16)], txt_seq_lens=txt_seq_lens, return_dict=False, )[0] From 764b878d1baab6a5df226bd17c46db41c7cec364 Mon Sep 17 00:00:00 2001 From: GUOGUO <55723162+Dong1017@users.noreply.github.com> Date: Tue, 28 Oct 2025 11:37:47 +0800 Subject: [PATCH 6/8] feat: QwenImageEditPlus - diffusers merged PR 12357 --- mindone/diffusers/__init__.py | 6 +- mindone/diffusers/pipelines/__init__.py | 8 +- .../diffusers/pipelines/qwenimage/__init__.py | 10 +- .../qwenimage/pipeline_qwenimage_edit.py | 4 +- .../pipeline_qwenimage_edit_inpaint.py | 8 +- .../qwenimage/pipeline_qwenimage_edit_plus.py | 859 ++++++++++++++++++ .../qwenimage/pipeline_qwenimage_img2img.py | 2 +- .../qwenimage/pipeline_qwenimage_inpaint.py | 6 +- 8 files changed, 886 insertions(+), 17 deletions(-) create mode 100644 mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_plus.py diff --git a/mindone/diffusers/__init__.py b/mindone/diffusers/__init__.py index d37b0e4668..c460af575f 100644 --- a/mindone/diffusers/__init__.py +++ b/mindone/diffusers/__init__.py @@ -263,11 +263,12 @@ "PixArtAlphaPipeline", "PixArtSigmaPAGPipeline", "PixArtSigmaPipeline", + "QwenImageEditInpaintPipeline", + "QwenImageEditPipeline", + "QwenImageEditPlusPipeline", "QwenImageImg2ImgPipeline", "QwenImageInpaintPipeline", "QwenImagePipeline", - "QwenImageEditPipeline", - "QwenImageEditInpaintPipeline", "ReduxImageEncoder", "SanaControlNetPipeline", "SanaPAGPipeline", @@ -657,6 +658,7 @@ PixArtSigmaPipeline, QwenImageEditInpaintPipeline, QwenImageEditPipeline, + QwenImageEditPlusPipeline, QwenImageImg2ImgPipeline, QwenImageInpaintPipeline, QwenImagePipeline, diff --git a/mindone/diffusers/pipelines/__init__.py b/mindone/diffusers/pipelines/__init__.py index bcf6d5ff54..9962c4a5b5 100644 --- a/mindone/diffusers/pipelines/__init__.py +++ b/mindone/diffusers/pipelines/__init__.py @@ -183,11 +183,12 @@ "PixArtSigmaPipeline", ], "qwenimage": [ - "QwenImageEditPipeline", - "QwenImageEditInpaintPipeline", + "QwenImagePipeline", "QwenImageImg2ImgPipeline", "QwenImageInpaintPipeline", - "QwenImagePipeline", + "QwenImageEditPipeline", + "QwenImageEditPlusPipeline", + "QwenImageEditInpaintPipeline", ], "sana": ["SanaPipeline", "SanaSprintPipeline", "SanaControlNetPipeline", "SanaSprintImg2ImgPipeline"], "semantic_stable_diffusion": ["SemanticStableDiffusionPipeline"], @@ -425,6 +426,7 @@ from .qwenimage import ( QwenImageEditInpaintPipeline, QwenImageEditPipeline, + QwenImageEditPlusPipeline, QwenImageImg2ImgPipeline, QwenImageInpaintPipeline, QwenImagePipeline, diff --git a/mindone/diffusers/pipelines/qwenimage/__init__.py b/mindone/diffusers/pipelines/qwenimage/__init__.py index 269d1b92e6..6ca55337c5 100644 --- a/mindone/diffusers/pipelines/qwenimage/__init__.py +++ b/mindone/diffusers/pipelines/qwenimage/__init__.py @@ -7,16 +7,22 @@ _import_structure = { "modeling_qwenimage": ["ReduxImageEncoder"], "pipeline_qwenimage": ["QwenImagePipeline"], - "pipeline_qwenimage_img2img": ["QwenImageImg2ImgPipeline"], - "pipeline_qwenimage_inpaint": ["QwenImageInpaintPipeline"], + "pipeline_qwenimage_controlnet": ["QwenImageControlNetPipeline"], + "pipeline_qwenimage_controlnet_inpaint": ["QwenImageControlNetInpaintPipeline"], "pipeline_qwenimage_edit": ["QwenImageEditPipeline"], "pipeline_qwenimage_edit_inpaint": ["QwenImageEditInpaintPipeline"], + "pipeline_qwenimage_edit_plus": ["QwenImageEditPlusPipeline"], + "pipeline_qwenimage_img2img": ["QwenImageImg2ImgPipeline"], + "pipeline_qwenimage_inpaint": 
["QwenImageInpaintPipeline"], } if TYPE_CHECKING: from .pipeline_qwenimage import QwenImagePipeline + from .pipeline_qwenimage_controlnet import QwenImageControlNetPipeline + from .pipeline_qwenimage_controlnet_inpaint import QwenImageControlNetInpaintPipeline from .pipeline_qwenimage_edit import QwenImageEditPipeline from .pipeline_qwenimage_edit_inpaint import QwenImageEditInpaintPipeline + from .pipeline_qwenimage_edit_plus import QwenImageEditPlusPipeline from .pipeline_qwenimage_img2img import QwenImageImg2ImgPipeline from .pipeline_qwenimage_inpaint import QwenImageInpaintPipeline else: diff --git a/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py b/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py index 7fe3fbbe2d..4c1bf98f14 100644 --- a/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py +++ b/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py @@ -545,7 +545,7 @@ def __call__( Args: image (`ms.tensor`, `PIL.Image.Image`, `np.ndarray`, `List[ms.tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both - numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list + numpy array and mindspore tensor, the expected value range is between `[0, 1]` If it's a tensor or a list or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image latents as `image`, but if passing latents directly it is not encoded again. @@ -664,7 +664,7 @@ def __call__( batch_size = prompt_embeds.shape[0] # 3. Preprocess image - if image is not None and not (isinstance(image, ms.Tensor) and image.size[1] == self.latent_channels): + if image is not None and not (isinstance(image, ms.tensor) and image.size[1] == self.latent_channels): image = self.image_processor.resize(image, calculated_height, calculated_width) prompt_image = image image = self.image_processor.preprocess(image, calculated_height, calculated_width) diff --git a/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_inpaint.py b/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_inpaint.py index 320b43eb4d..521f0fa791 100644 --- a/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_inpaint.py +++ b/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_inpaint.py @@ -43,7 +43,7 @@ EXAMPLE_DOC_STRING = """ Examples: ```py - >>> import torch + >>> import mindspore >>> from PIL import Image >>> from mindone.diffusers import QwenImageEditInpaintPipeline >>> from mindone.diffusers.utils import load_image @@ -680,7 +680,7 @@ def __call__( Args: image (`ms.tensor`, `PIL.Image.Image`, `np.ndarray`, `List[ms.tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both - numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list + numpy array and mindspore tensor, the expected value range is between `[0, 1]` If it's a tensor or a list or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image latents as `image`, but if passing latents directly it is not encoded again. 
@@ -701,8 +701,8 @@ def __call__( mask_image (`ms.tensor`, `PIL.Image.Image`, `np.ndarray`, `List[ms.tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, numpy array or tensor representing an image batch to mask `image`. White pixels in the mask are repainted while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a - single channel (luminance) before use. If it's a numpy array or pytorch tensor, it should contain one - color channel (L) instead of 3, so the expected shape for pytorch tensor would be `(B, 1, H, W)`, `(B, + single channel (luminance) before use. If it's a numpy array or mindspore tensor, it should contain one + color channel (L) instead of 3, so the expected shape for mindspore tensor would be `(B, 1, H, W)`, `(B, H, W)`, `(1, H, W)`, `(H, W)`. And for numpy array would be for `(B, H, W, 1)`, `(B, H, W)`, `(H, W, 1)`, or `(H, W)`. mask_image_latent (`ms.tensor`, `List[ms.tensor]`): diff --git a/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_plus.py b/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_plus.py new file mode 100644 index 0000000000..cab0b1d483 --- /dev/null +++ b/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_plus.py @@ -0,0 +1,859 @@ +# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved. +# +# This code is adapted from https://github.com/huggingface/diffusers +# with modifications to run diffusers on mindspore. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +import math +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +from transformers import Qwen2Tokenizer, Qwen2VLProcessor + +import mindspore as ms +from mindspore import mint + +from ....transformers import Qwen2_5_VLForConditionalGeneration +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import QwenImageLoraLoaderMixin +from ...models import AutoencoderKLQwenImage, QwenImageTransformer2DModel +from ...schedulers import FlowMatchEulerDiscreteScheduler +from ...utils import logging +from ...utils.mindspore_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline +from .pipeline_output import QwenImagePipelineOutput + +XLA_AVAILABLE = False + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import mindspore + >>> from PIL import Image + >>> from mindone.diffusers import QwenImageEditPlusPipeline + >>> from mindone.diffusers.utils import load_image + + >>> pipe = QwenImageEditPlusPipeline.from_pretrained("Qwen/Qwen-Image-Edit-2509", mindspore_dtype=mindspore.bfloat16) + >>> image = load_image( + ... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/yarn-art-pikachu.png" + ... ).convert("RGB") + >>> prompt = ( + ... "Make Pikachu hold a sign that says 'Qwen Edit is awesome', yarn art style, detailed, vibrant colors" + ... 
) + >>> # Depending on the variant being used, the pipeline call will slightly vary. + >>> # Refer to the pipeline documentation for more details. + >>> image = pipe(image, prompt, num_inference_steps=50)[0][0] + >>> image.save("qwenimage_edit_plus.png") + ``` +""" + +CONDITION_IMAGE_SIZE = 384 * 384 +VAE_IMAGE_SIZE = 1024 * 1024 + + +# Copied from diffusers.pipelines.qwenimage.pipeline_qwenimage.calculate_shift +def calculate_shift( + image_seq_len, + base_seq_len: int = 256, + max_seq_len: int = 4096, + base_shift: float = 0.5, + max_shift: float = 1.15, +): + m = (max_shift - base_shift) / (max_seq_len - base_seq_len) + b = base_shift - m * base_seq_len + mu = image_seq_len * m + b + return mu + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, + **kwargs, +): + r""" + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. + timesteps (`List[int]`, *optional*): + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. + + Returns: + `Tuple[ms.tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." 
+ ) + scheduler.set_timesteps(sigmas=sigmas, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents +def retrieve_latents( + vae, encoder_output: ms.tensor, generator: Optional[np.random.Generator] = None, sample_mode: str = "sample" +): + if sample_mode == "sample": + return vae.diag_gauss_dist.sample(encoder_output, generator=generator) + elif sample_mode == "argmax": + return vae.diag_gauss_dist.mode(encoder_output) + # This brach is not needed because the encoder_output type is ms.tensor as per AutoencoderKLOuput change + # elif hasattr(encoder_output, "latents"): + # return encoder_output.latents + else: + return encoder_output + + +def calculate_dimensions(target_area, ratio): + width = math.sqrt(target_area * ratio) + height = width / ratio + + width = round(width / 32) * 32 + height = round(height / 32) * 32 + + return width, height + + +class QwenImageEditPlusPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin): + r""" + The Qwen-Image-Edit pipeline for image editing. + + Args: + transformer ([`QwenImageTransformer2DModel`]): + Conditional Transformer (MMDiT) architecture to denoise the encoded image latents. + scheduler ([`FlowMatchEulerDiscreteScheduler`]): + A scheduler to be used in combination with `transformer` to denoise the encoded image latents. + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`Qwen2.5-VL-7B-Instruct`]): + [Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct), specifically the + [Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) variant. + tokenizer (`QwenTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPTokenizer). + """ + + model_cpu_offload_seq = "text_encoder->transformer->vae" + _callback_tensor_inputs = ["latents", "prompt_embeds"] + + def __init__( + self, + scheduler: FlowMatchEulerDiscreteScheduler, + vae: AutoencoderKLQwenImage, + text_encoder: Qwen2_5_VLForConditionalGeneration, + tokenizer: Qwen2Tokenizer, + processor: Qwen2VLProcessor, + transformer: QwenImageTransformer2DModel, + ): + super().__init__() + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + processor=processor, + transformer=transformer, + scheduler=scheduler, + ) + self.vae_scale_factor = 2 ** len(self.vae.temperal_downsample) if getattr(self, "vae", None) else 8 + self.latent_channels = self.vae.config.z_dim if getattr(self, "vae", None) else 16 + # QwenImage latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible + # by the patch size. So the vae scale factor is multiplied by the patch size to account for this + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2) + self.tokenizer_max_length = 1024 + + self.prompt_template_encode = ( + "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, " + "background), then explain how the user's text instruction should alter or modify the image. 
Generate a new " + "image that meets the user's requirements while maintaining consistency with the original input where appropriate" + ".<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n" + ) + self.prompt_template_encode_start_idx = 64 + self.default_sample_size = 128 + + # Copied from diffusers.pipelines.qwenimage.pipeline_qwenimage.QwenImagePipeline._extract_masked_hidden + def _extract_masked_hidden(self, hidden_states: ms.tensor, mask: ms.tensor): + bool_mask = mask.bool() + valid_lengths = bool_mask.sum(dim=1) + selected = hidden_states[bool_mask] + split_result = mint.split(selected, valid_lengths.tolist(), dim=0) + + return split_result + + def _get_qwen_prompt_embeds( + self, + prompt: Union[str, List[str]] = None, + image: Optional[ms.tensor] = None, + dtype: Optional[ms.dtype] = None, + ): + dtype = dtype or self.text_encoder.dtype + + prompt = [prompt] if isinstance(prompt, str) else prompt + img_prompt_template = "Picture {}: <|vision_start|><|image_pad|><|vision_end|>" + if isinstance(image, list): + base_img_prompt = "" + for i, img in enumerate(image): + base_img_prompt += img_prompt_template.format(i + 1) + elif image is not None: + base_img_prompt = img_prompt_template.format(1) + else: + base_img_prompt = "" + + template = self.prompt_template_encode + + drop_idx = self.prompt_template_encode_start_idx + txt = [template.format(base_img_prompt + e) for e in prompt] + + model_inputs = self.processor( + text=txt, + images=image, + padding=True, + return_tensors="np", + ) + + outputs = self.text_encoder( + input_ids=ms.tensor(model_inputs.input_ids), + attention_mask=ms.tensor(model_inputs.attention_mask), + pixel_values=ms.tensor(model_inputs.pixel_values), + image_grid_thw=ms.tensor(model_inputs.image_grid_thw), + output_hidden_states=True, + ) + + hidden_states = outputs.hidden_states[-1] + split_hidden_states = self._extract_masked_hidden(hidden_states, ms.tensor(model_inputs.attention_mask)) + split_hidden_states = [e[drop_idx:] for e in split_hidden_states] + attn_mask_list = [mint.ones(e.shape[0], dtype=ms.int64) for e in split_hidden_states] + max_seq_len = max([e.shape[0] for e in split_hidden_states]) + prompt_embeds = mint.stack( + [mint.cat([u, u.new_zeros((max_seq_len - u.shape[0], u.shape[1]))]) for u in split_hidden_states] + ) + encoder_attention_mask = mint.stack( + [mint.cat([u, u.new_zeros((max_seq_len - u.shape[0]))]) for u in attn_mask_list] + ) + + prompt_embeds = prompt_embeds.to(dtype=dtype) + + return prompt_embeds, encoder_attention_mask + + # Copied from diffusers.pipelines.qwenimage.pipeline_qwenimage_edit.QwenImageEditPipeline.encode_prompt + def encode_prompt( + self, + prompt: Union[str, List[str]], + image: Optional[ms.tensor] = None, + num_images_per_prompt: int = 1, + prompt_embeds: Optional[ms.tensor] = None, + prompt_embeds_mask: Optional[ms.tensor] = None, + max_sequence_length: int = 1024, + ): + r""" + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + image (`ms.tensor`, *optional*): + image to be encoded + num_images_per_prompt (`int`): + number of images that should be generated per prompt + prompt_embeds (`ms.tensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. 
+ """ + prompt = [prompt] if isinstance(prompt, str) else prompt + batch_size = len(prompt) if prompt_embeds is None else prompt_embeds.shape[0] + + if prompt_embeds is None: + prompt_embeds, prompt_embeds_mask = self._get_qwen_prompt_embeds(prompt, image) + + _, seq_len, _ = prompt_embeds.shape + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + prompt_embeds_mask = prompt_embeds_mask.repeat(1, num_images_per_prompt, 1) + prompt_embeds_mask = prompt_embeds_mask.view(batch_size * num_images_per_prompt, seq_len) + + return prompt_embeds, prompt_embeds_mask + + # Copied from diffusers.pipelines.qwenimage.pipeline_qwenimage_edit.QwenImageEditPipeline.check_inputs + def check_inputs( + self, + prompt, + height, + width, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + prompt_embeds_mask=None, + negative_prompt_embeds_mask=None, + callback_on_step_end_tensor_inputs=None, + max_sequence_length=None, + ): + if height % (self.vae_scale_factor * 2) != 0 or width % (self.vae_scale_factor * 2) != 0: + logger.warning( + f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}." + " Dimensions will be resized accordingly" + ) + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found" + f" {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and prompt_embeds_mask is None: + raise ValueError( + "If `prompt_embeds` are provided, `prompt_embeds_mask` also have to be passed. Make sure to generate `prompt_embeds_mask`" + " from the same text encoder that was used to generate `prompt_embeds`." + ) + + if negative_prompt_embeds is not None and negative_prompt_embeds_mask is None: + raise ValueError( + "If `negative_prompt_embeds` are provided, `negative_prompt_embeds_mask` also have to be passed. Make sure to generate" + " `negative_prompt_embeds_mask` from the same text encoder that was used to generate `negative_prompt_embeds`." 
+ ) + + if max_sequence_length is not None and max_sequence_length > 1024: + raise ValueError(f"`max_sequence_length` cannot be greater than 1024 but is {max_sequence_length}") + + @staticmethod + # Copied from diffusers.pipelines.qwenimage.pipeline_qwenimage.QwenImagePipeline._pack_latents + def _pack_latents(latents, batch_size, num_channels_latents, height, width): + latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2) + latents = latents.permute(0, 2, 4, 1, 3, 5) + latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4) + + return latents + + @staticmethod + # Copied from diffusers.pipelines.qwenimage.pipeline_qwenimage.QwenImagePipeline._unpack_latents + def _unpack_latents(latents, height, width, vae_scale_factor): + batch_size, num_patches, channels = latents.shape + + # VAE applies 8x compression on images but we must also account for packing which requires + # latent height and width to be divisible by 2. + height = 2 * (int(height) // (vae_scale_factor * 2)) + width = 2 * (int(width) // (vae_scale_factor * 2)) + + latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2) + latents = latents.permute(0, 3, 1, 4, 2, 5) + + latents = latents.reshape(batch_size, channels // (2 * 2), 1, height, width) + + return latents + + # Copied from diffusers.pipelines.qwenimage.pipeline_qwenimage_edit.QwenImageEditPipeline._encode_vae_image + def _encode_vae_image(self, image: ms.tensor, generator: np.random.Generator): + if isinstance(generator, list): + image_latents = [ + retrieve_latents(self.vae, self.vae.encode(image[i : i + 1])[0], sample_mode="argmax") + for i in range(image.shape[0]) + ] + image_latents = mint.cat(image_latents, dim=0) + else: + image_latents = retrieve_latents(self.vae, self.vae.encode(image)[0], sample_mode="argmax") + + latents_mean = ( + ms.tensor(self.vae.config.latents_mean).view(1, self.latent_channels, 1, 1, 1).to(image_latents.dtype) + ) + latents_std = ( + ms.tensor(self.vae.config.latents_std).view(1, self.latent_channels, 1, 1, 1).to(image_latents.dtype) + ) + image_latents = (image_latents - latents_mean) / latents_std + + return image_latents + + def prepare_latents( + self, + images, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): + # VAE applies 8x compression on images but we must also account for packing which requires + # latent height and width to be divisible by 2. + height = 2 * (int(height) // (self.vae_scale_factor * 2)) + width = 2 * (int(width) // (self.vae_scale_factor * 2)) + + shape = (batch_size, 1, num_channels_latents, height, width) + + image_latents = None + if images is not None: + if not isinstance(images, list): + images = [images] + all_image_latents = [] + for image in images: + image = image.to(dtype=dtype) + if image.shape[1] != self.latent_channels: + image_latents = self._encode_vae_image(image=image, generator=generator) + else: + image_latents = image + if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0: + # expand init_latents for batch_size + additional_image_per_prompt = batch_size // image_latents.shape[0] + image_latents = mint.cat([image_latents] * additional_image_per_prompt, dim=0) + elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts." 
+ ) + else: + image_latents = mint.cat([image_latents], dim=0) + + image_latent_height, image_latent_width = image_latents.shape[3:] + image_latents = self._pack_latents( + image_latents, batch_size, num_channels_latents, image_latent_height, image_latent_width + ) + all_image_latents.append(image_latents) + image_latents = mint.cat(all_image_latents, dim=1) + + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + if latents is None: + latents = randn_tensor(shape, generator=generator, dtype=dtype) + latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width) + else: + latents = latents.to(dtype=dtype) + + return latents, image_latents + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def attention_kwargs(self): + return self._attention_kwargs + + @property + def num_timesteps(self): + return self._num_timesteps + + @property + def current_timestep(self): + return self._current_timestep + + @property + def interrupt(self): + return self._interrupt + + def __call__( + self, + image: Optional[PipelineImageInput] = None, + prompt: Union[str, List[str]] = None, + negative_prompt: Union[str, List[str]] = None, + true_cfg_scale: float = 4.0, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + sigmas: Optional[List[float]] = None, + guidance_scale: Optional[float] = None, + num_images_per_prompt: int = 1, + generator: Optional[Union[np.random.Generator, List[np.random.Generator]]] = None, + latents: Optional[ms.tensor] = None, + prompt_embeds: Optional[ms.tensor] = None, + prompt_embeds_mask: Optional[ms.tensor] = None, + negative_prompt_embeds: Optional[ms.tensor] = None, + negative_prompt_embeds_mask: Optional[ms.tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + attention_kwargs: Optional[Dict[str, Any]] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + max_sequence_length: int = 512, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + image (`ms.tensor`, `PIL.Image.Image`, `np.ndarray`, `List[ms.tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both + numpy array and mindspore tensor, the expected value range is between `[0, 1]` If it's a tensor or a list + or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a + list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image + latents as `image`, but if passing latents directly it is not encoded again. + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `true_cfg_scale` is + not greater than `1`). 
+ true_cfg_scale (`float`, *optional*, defaults to 1.0): + true_cfg_scale (`float`, *optional*, defaults to 1.0): Guidance scale as defined in [Classifier-Free + Diffusion Guidance](https://huggingface.co/papers/2207.12598). `true_cfg_scale` is defined as `w` of + equation 2. of [Imagen Paper](https://huggingface.co/papers/2205.11487). Classifier-free guidance is + enabled by setting `true_cfg_scale > 1` and a provided `negative_prompt`. Higher guidance scale + encourages to generate images that are closely linked to the text `prompt`, usually at the expense of + lower image quality. + height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The height in pixels of the generated image. This is set to 1024 by default for the best results. + width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The width in pixels of the generated image. This is set to 1024 by default for the best results. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. + guidance_scale (`float`, *optional*, defaults to None): + A guidance scale value for guidance distilled models. Unlike the traditional classifier-free guidance + where the guidance scale is applied during inference through noise prediction rescaling, guidance + distilled models take the guidance scale directly as an input parameter during forward pass. Guidance + scale is enabled by setting `guidance_scale > 1`. Higher guidance scale encourages to generate images + that are closely linked to the text `prompt`, usually at the expense of lower image quality. This + parameter in the pipeline is there to support future guidance-distilled models when they come up. It is + ignored when not using guidance distilled models. To enable traditional classifier-free guidance, + please pass `true_cfg_scale > 1.0` and `negative_prompt` (even an empty negative prompt like " " should + enable classifier-free guidance computations). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`np.random.Generator` or `List[np.random.Generator]`, *optional*): + One or a list of [numpy generator(s)](https://numpy.org/doc/stable/reference/random/generator.html) + to make generation deterministic. + latents (`ms.tensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will be generated by sampling using the supplied random `generator`. + prompt_embeds (`ms.tensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`ms.tensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. 
+ output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.qwenimage.QwenImagePipelineOutput`] instead of a plain tuple. + attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + max_sequence_length (`int` defaults to 512): Maximum sequence length to use with the `prompt`. + + Examples: + + Returns: + [`~pipelines.qwenimage.QwenImagePipelineOutput`] or `tuple`: + [`~pipelines.qwenimage.QwenImagePipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When + returning a tuple, the first element is a list with the generated images. + """ + image_size = image[-1].size if isinstance(image, list) else image.size + calculated_width, calculated_height = calculate_dimensions(1024 * 1024, image_size[0] / image_size[1]) + height = height or calculated_height + width = width or calculated_width + + multiple_of = self.vae_scale_factor * 2 + width = width // multiple_of * multiple_of + height = height // multiple_of * multiple_of + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + height, + width, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + prompt_embeds_mask=prompt_embeds_mask, + negative_prompt_embeds_mask=negative_prompt_embeds_mask, + callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs, + max_sequence_length=max_sequence_length, + ) + + self._guidance_scale = guidance_scale + self._attention_kwargs = attention_kwargs + self._current_timestep = None + self._interrupt = False + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # 3. 
Preprocess image + if image is not None and not (isinstance(image, ms.tensor) and image.size[1] == self.latent_channels): + if not isinstance(image, list): + image = [image] + condition_image_sizes = [] + condition_images = [] + vae_image_sizes = [] + vae_images = [] + for img in image: + image_width, image_height = img.size + condition_width, condition_height = calculate_dimensions( + CONDITION_IMAGE_SIZE, image_width / image_height + ) + vae_width, vae_height = calculate_dimensions(VAE_IMAGE_SIZE, image_width / image_height) + condition_image_sizes.append((condition_width, condition_height)) + vae_image_sizes.append((vae_width, vae_height)) + condition_images.append(self.image_processor.resize(img, condition_height, condition_width)) + vae_images.append(self.image_processor.preprocess(img, vae_height, vae_width).unsqueeze(2)) + + has_neg_prompt = negative_prompt is not None or ( + negative_prompt_embeds is not None and negative_prompt_embeds_mask is not None + ) + + if true_cfg_scale > 1 and not has_neg_prompt: + logger.warning( + f"true_cfg_scale is passed as {true_cfg_scale}, but classifier-free guidance is not enabled since no negative_prompt is provided." + ) + elif true_cfg_scale <= 1 and has_neg_prompt: + logger.warning( + " negative_prompt is passed but classifier-free guidance is not enabled since true_cfg_scale <= 1" + ) + + do_true_cfg = true_cfg_scale > 1 and has_neg_prompt + prompt_embeds, prompt_embeds_mask = self.encode_prompt( + image=condition_images, + prompt=prompt, + prompt_embeds=prompt_embeds, + prompt_embeds_mask=prompt_embeds_mask, + num_images_per_prompt=num_images_per_prompt, + max_sequence_length=max_sequence_length, + ) + if do_true_cfg: + negative_prompt_embeds, negative_prompt_embeds_mask = self.encode_prompt( + image=condition_images, + prompt=negative_prompt, + prompt_embeds=negative_prompt_embeds, + prompt_embeds_mask=negative_prompt_embeds_mask, + num_images_per_prompt=num_images_per_prompt, + max_sequence_length=max_sequence_length, + ) + + # 4. Prepare latent variables + num_channels_latents = self.transformer.config.in_channels // 4 + latents, image_latents = self.prepare_latents( + vae_images, + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + generator, + latents, + ) + img_shapes = [ + [ + (1, height // self.vae_scale_factor // 2, width // self.vae_scale_factor // 2), + *[ + (1, vae_height // self.vae_scale_factor // 2, vae_width // self.vae_scale_factor // 2) + for vae_width, vae_height in vae_image_sizes + ], + ] + ] * batch_size + + # 5. 
Prepare timesteps + sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas + image_seq_len = latents.shape[1] + mu = calculate_shift( + image_seq_len, + self.scheduler.config.get("base_image_seq_len", 256), + self.scheduler.config.get("max_image_seq_len", 4096), + self.scheduler.config.get("base_shift", 0.5), + self.scheduler.config.get("max_shift", 1.15), + ) + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, + num_inference_steps, + sigmas=sigmas, + mu=mu, + ) + num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) + self._num_timesteps = len(timesteps) + + # handle guidance + if self.transformer.config.guidance_embeds and guidance_scale is None: + raise ValueError("guidance_scale is required for guidance-distilled model.") + elif self.transformer.config.guidance_embeds: + guidance = mint.full([1], guidance_scale, dtype=ms.float32) + guidance = guidance.expand((latents.shape[0],)) + elif not self.transformer.config.guidance_embeds and guidance_scale is not None: + logger.warning( + f"guidance_scale is passed as {guidance_scale}, but ignored since the model is not guidance-distilled." + ) + guidance = None + elif not self.transformer.config.guidance_embeds and guidance_scale is None: + guidance = None + + if self.attention_kwargs is None: + self._attention_kwargs = {} + + txt_seq_lens = prompt_embeds_mask.sum(dim=1).tolist() if prompt_embeds_mask is not None else None + negative_txt_seq_lens = ( + negative_prompt_embeds_mask.sum(dim=1).tolist() if negative_prompt_embeds_mask is not None else None + ) + + # 6. Denoising loop + self.scheduler.set_begin_index(0) + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + self._current_timestep = t + + latent_model_input = latents + if image_latents is not None: + latent_model_input = mint.cat([latents, image_latents], dim=1) + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timestep = t.expand((latents.shape[0],)).to(latents.dtype) + noise_pred = self.transformer( + hidden_states=latent_model_input, + timestep=timestep / 1000, + guidance=guidance, + encoder_hidden_states_mask=prompt_embeds_mask, + encoder_hidden_states=prompt_embeds, + img_shapes=img_shapes, + txt_seq_lens=txt_seq_lens, + attention_kwargs=self.attention_kwargs, + return_dict=False, + )[0] + noise_pred = noise_pred[:, : latents.shape[1]] + + if do_true_cfg: + neg_noise_pred = self.transformer( + hidden_states=latent_model_input, + timestep=timestep / 1000, + guidance=guidance, + encoder_hidden_states_mask=negative_prompt_embeds_mask, + encoder_hidden_states=negative_prompt_embeds, + img_shapes=img_shapes, + txt_seq_lens=negative_txt_seq_lens, + attention_kwargs=self.attention_kwargs, + return_dict=False, + )[0] + neg_noise_pred = neg_noise_pred[:, : latents.shape[1]] + comb_pred = neg_noise_pred + true_cfg_scale * (noise_pred - neg_noise_pred) + + cond_norm = mint.norm(noise_pred, dim=-1, keepdim=True) + noise_norm = mint.norm(comb_pred, dim=-1, keepdim=True) + noise_pred = comb_pred * (cond_norm / noise_norm) + + # compute the previous noisy sample x_t -> x_t-1 + latents_dtype = latents.dtype + latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0] + + if latents.dtype != latents_dtype: + latents = latents.to(latents_dtype) + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + 
callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + + self._current_timestep = None + if output_type == "latent": + image = latents + else: + latents = self._unpack_latents(latents, height, width, self.vae_scale_factor) + latents = latents.to(self.vae.dtype) + latents_mean = ( + ms.tensor(self.vae.config.latents_mean).view(1, self.vae.config.z_dim, 1, 1, 1).to(latents.dtype) + ) + latents_std = 1.0 / ms.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1).to( + latents.dtype + ) + latents = latents / latents_std + latents_mean + image = self.vae.decode(latents, return_dict=False)[0][:, :, 0] + image = self.image_processor.postprocess(image, output_type=output_type) + + if not return_dict: + return (image,) + + return QwenImagePipelineOutput(images=image) diff --git a/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_img2img.py b/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_img2img.py index 035648ec0d..d357878528 100644 --- a/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_img2img.py +++ b/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_img2img.py @@ -530,7 +530,7 @@ def __call__( not greater than `1`). image (`ms.tensor`, `PIL.Image.Image`, `np.ndarray`, `List[ms.tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both - numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list + numpy array and mindspore tensor, the expected value range is between `[0, 1]` If it's a tensor or a list or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image latents as `image`, but if passing latents directly it is not encoded again. diff --git a/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_inpaint.py b/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_inpaint.py index de89eaf5b5..b165e210d9 100644 --- a/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_inpaint.py +++ b/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_inpaint.py @@ -642,7 +642,7 @@ def __call__( not greater than `1`). image (`ms.tensor`, `PIL.Image.Image`, `np.ndarray`, `List[ms.tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both - numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list + numpy array and mindspore tensor, the expected value range is between `[0, 1]` If it's a tensor or a list or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image latents as `image`, but if passing latents directly it is not encoded again. 
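
A minimal usage sketch for the new `QwenImageEditPlusPipeline` added above (not part of the patch). The model id, `load_image`, and the `[0][0]` output indexing follow the pipeline's EXAMPLE_DOC_STRING; the local file names and the prompt are hypothetical. Each entry of the `image` list is resized internally to a target area of about 384*384 pixels for the text-encoder condition and 1024*1024 pixels for the VAE latents (aspect ratio preserved), and can be referenced in the prompt as "Picture 1", "Picture 2", and so on.

```py
import mindspore
from mindone.diffusers import QwenImageEditPlusPipeline
from mindone.diffusers.utils import load_image

pipe = QwenImageEditPlusPipeline.from_pretrained(
    "Qwen/Qwen-Image-Edit-2509", mindspore_dtype=mindspore.bfloat16
)

# Multiple reference images are supported; they map to "Picture 1", "Picture 2", ...
images = [load_image("subject.png"), load_image("scene.png")]  # hypothetical files

result = pipe(
    image=images,
    prompt="Place the subject from Picture 1 into the scene from Picture 2",
    negative_prompt=" ",   # together with true_cfg_scale > 1, enables classifier-free guidance
    true_cfg_scale=4.0,
    num_inference_steps=40,
)[0][0]
result.save("qwenimage_edit_plus_multi.png")
```
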
@@ -656,8 +656,8 @@ def __call__( mask_image (`ms.tensor`, `PIL.Image.Image`, `np.ndarray`, `List[ms.tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, numpy array or tensor representing an image batch to mask `image`. White pixels in the mask are repainted while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a - single channel (luminance) before use. If it's a numpy array or pytorch tensor, it should contain one - color channel (L) instead of 3, so the expected shape for pytorch tensor would be `(B, 1, H, W)`, `(B, + single channel (luminance) before use. If it's a numpy array or mindspore tensor, it should contain one + color channel (L) instead of 3, so the expected shape for mindspore tensor would be `(B, 1, H, W)`, `(B, H, W)`, `(1, H, W)`, `(H, W)`. And for numpy array would be for `(B, H, W, 1)`, `(B, H, W)`, `(H, W, 1)`, or `(H, W)`. mask_image_latent (`ms.tensor`, `List[ms.tensor]`): From f42f9a63e97c6febfd6e92fff203c9e9ee2f5b8e Mon Sep 17 00:00:00 2001 From: GUOGUO <55723162+Dong1017@users.noreply.github.com> Date: Tue, 28 Oct 2025 14:53:08 +0800 Subject: [PATCH 7/8] feat: QwenImageEditPlus - diffusers merged PR 12357 --- mindone/diffusers/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mindone/diffusers/__init__.py b/mindone/diffusers/__init__.py index c460af575f..6e0732c124 100644 --- a/mindone/diffusers/__init__.py +++ b/mindone/diffusers/__init__.py @@ -658,7 +658,7 @@ PixArtSigmaPipeline, QwenImageEditInpaintPipeline, QwenImageEditPipeline, - QwenImageEditPlusPipeline, + QwenImageEditPlusPipeline, QwenImageImg2ImgPipeline, QwenImageInpaintPipeline, QwenImagePipeline, From 9cae3a62bd55f89ce3891d644aa8208f0e3eea25 Mon Sep 17 00:00:00 2001 From: GUOGUO <55723162+Dong1017@users.noreply.github.com> Date: Wed, 29 Oct 2025 08:47:08 +0800 Subject: [PATCH 8/8] feat: ControlNet - diffusers merged PR 12215, 12301 --- mindone/diffusers/__init__.py | 8 + mindone/diffusers/models/__init__.py | 1 + .../diffusers/models/controlnets/__init__.py | 1 + .../controlnets/controlnet_qwenimage.py | 354 +++++++ mindone/diffusers/pipelines/__init__.py | 4 + .../pipeline_qwenimage_controlnet.py | 983 ++++++++++++++++++ .../pipeline_qwenimage_controlnet_inpaint.py | 914 ++++++++++++++++ .../qwenimage/pipeline_qwenimage_edit_plus.py | 6 +- 8 files changed, 2268 insertions(+), 3 deletions(-) create mode 100644 mindone/diffusers/models/controlnets/controlnet_qwenimage.py create mode 100644 mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_controlnet.py create mode 100644 mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_controlnet_inpaint.py diff --git a/mindone/diffusers/__init__.py b/mindone/diffusers/__init__.py index 6e0732c124..73437f3563 100644 --- a/mindone/diffusers/__init__.py +++ b/mindone/diffusers/__init__.py @@ -106,6 +106,8 @@ "OmniGenTransformer2DModel", "PixArtTransformer2DModel", "PriorTransformer", + "QwenImageControlNetModel", + "QwenImageMultiControlNetModel", "QwenImageTransformer2DModel", "SanaControlNetModel", "SanaTransformer2DModel", @@ -263,6 +265,8 @@ "PixArtAlphaPipeline", "PixArtSigmaPAGPipeline", "PixArtSigmaPipeline", + "QwenImageControlNetInpaintPipeline", + "QwenImageControlNetPipeline", "QwenImageEditInpaintPipeline", "QwenImageEditPipeline", "QwenImageEditPlusPipeline", @@ -488,6 +492,8 @@ OmniGenTransformer2DModel, PixArtTransformer2DModel, PriorTransformer, + QwenImageControlNetModel, + QwenImageMultiControlNetModel, QwenImageTransformer2DModel, SanaControlNetModel, 
SanaTransformer2DModel, @@ -656,6 +662,8 @@ PixArtAlphaPipeline, PixArtSigmaPAGPipeline, PixArtSigmaPipeline, + QwenImageControlNetInpaintPipeline, + QwenImageControlNetPipeline, QwenImageEditInpaintPipeline, QwenImageEditPipeline, QwenImageEditPlusPipeline, diff --git a/mindone/diffusers/models/__init__.py b/mindone/diffusers/models/__init__.py index bdddbd05d5..4aa9494305 100644 --- a/mindone/diffusers/models/__init__.py +++ b/mindone/diffusers/models/__init__.py @@ -46,6 +46,7 @@ "HunyuanDiT2DControlNetModel", "HunyuanDiT2DMultiControlNetModel", ], + "controlnets.controlnet_qwenimage": ["QwenImageControlNetModel", "QwenImageMultiControlNetModel"], "controlnets.controlnet_sana": ["SanaControlNetModel"], "controlnets.controlnet_sd3": ["SD3ControlNetModel", "SD3MultiControlNetModel"], "controlnets.controlnet_sparsectrl": ["SparseControlNetModel"], diff --git a/mindone/diffusers/models/controlnets/__init__.py b/mindone/diffusers/models/controlnets/__init__.py index 421b641299..8a0b38770e 100644 --- a/mindone/diffusers/models/controlnets/__init__.py +++ b/mindone/diffusers/models/controlnets/__init__.py @@ -3,6 +3,7 @@ from .controlnet import ControlNetModel, ControlNetOutput from .controlnet_flux import FluxControlNetModel, FluxControlNetOutput, FluxMultiControlNetModel from .controlnet_hunyuan import HunyuanControlNetOutput, HunyuanDiT2DControlNetModel, HunyuanDiT2DMultiControlNetModel +from .controlnet_qwenimage import QwenImageControlNetModel, QwenImageMultiControlNetModel from .controlnet_sana import SanaControlNetModel from .controlnet_sd3 import SD3ControlNetModel, SD3ControlNetOutput, SD3MultiControlNetModel from .controlnet_sparsectrl import SparseControlNetConditioningEmbedding, SparseControlNetModel, SparseControlNetOutput diff --git a/mindone/diffusers/models/controlnets/controlnet_qwenimage.py b/mindone/diffusers/models/controlnets/controlnet_qwenimage.py new file mode 100644 index 0000000000..a80b72055e --- /dev/null +++ b/mindone/diffusers/models/controlnets/controlnet_qwenimage.py @@ -0,0 +1,354 @@ +# Copyright 2025 Black Forest Labs, The HuggingFace Team and The InstantX Team. All rights reserved. +# +# This code is adapted from https://github.com/huggingface/diffusers +# with modifications to run diffusers on mindspore. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Union + +import mindspore as ms +from mindspore import mint, nn +from mindspore.common.initializer import initializer + +from ...configuration_utils import ConfigMixin, register_to_config +from ...loaders import FromOriginalModelMixin, PeftAdapterMixin +from ...utils import BaseOutput, logging +from ..attention_processor import AttentionProcessor +from ..cache_utils import CacheMixin +from ..controlnets.controlnet import zero_module +from ..modeling_outputs import Transformer2DModelOutput +from ..modeling_utils import ModelMixin +from ..transformers.transformer_qwenimage import ( + QwenEmbedRope, + QwenImageTransformerBlock, + QwenTimestepProjEmbeddings, + RMSNorm, +) + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +@dataclass +class QwenImageControlNetOutput(BaseOutput): + controlnet_block_samples: Tuple[ms.tensor] + + +class QwenImageControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, CacheMixin): + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + patch_size: int = 2, + in_channels: int = 64, + out_channels: Optional[int] = 16, + num_layers: int = 60, + attention_head_dim: int = 128, + num_attention_heads: int = 24, + joint_attention_dim: int = 3584, + axes_dims_rope: Tuple[int, int, int] = (16, 56, 56), + extra_condition_channels: int = 0, # for controlnet-inpainting + ): + super().__init__() + self.out_channels = out_channels or in_channels + self.inner_dim = num_attention_heads * attention_head_dim + + self.pos_embed = QwenEmbedRope(theta=10000, axes_dim=list(axes_dims_rope), scale_rope=True) + + self.time_text_embed = QwenTimestepProjEmbeddings(embedding_dim=self.inner_dim) + + self.txt_norm = RMSNorm(joint_attention_dim, eps=1e-6) + + self.img_in = mint.nn.Linear(in_channels, self.inner_dim) + self.txt_in = mint.nn.Linear(joint_attention_dim, self.inner_dim) + + self.transformer_blocks = nn.CellList( + [ + QwenImageTransformerBlock( + dim=self.inner_dim, + num_attention_heads=num_attention_heads, + attention_head_dim=attention_head_dim, + ) + for _ in range(num_layers) + ] + ) + + # controlnet_blocks + controlnet_blocks = [] + for _ in range(len(self.transformer_blocks)): + controlnet_blocks.append(zero_module(mint.nn.Linear(self.inner_dim, self.inner_dim))) + self.controlnet_x_embedder = zero_module(mint.nn.Linear(in_channels + extra_condition_channels, self.inner_dim)) + self.controlnet_blocks = nn.CellList(controlnet_blocks) + + self.gradient_checkpointing = False + + @property + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors + def attn_processors(self): + r""" + Returns: + `dict` of attention processors: A dictionary containing all attention processors used in the model with + indexed by its weight name. 
+ """ + # set recursively + processors = {} + + def fn_recursive_add_processors(name: str, module: nn.Cell, processors: Dict[str, AttentionProcessor]): + if hasattr(module, "get_processor"): + processors[f"{name}.processor"] = module.get_processor() + + for sub_name, child in module.name_cells().items(): + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) + + return processors + + for name, module in self.name_cells().items(): + fn_recursive_add_processors(name, module, processors) + + return processors + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor + def set_attn_processor(self, processor): + r""" + Sets the attention processor to use to compute attention. + + Parameters: + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + for **all** `Attention` layers. + + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. + + """ + count = len(self.attn_processors.keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." + ) + + def fn_recursive_attn_processor(name: str, module: nn.Cell, processor): + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor) + else: + module.set_processor(processor.pop(f"{name}.processor")) + + for sub_name, child in module.name_cells().items(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + + for name, module in self.name_cells().items(): + fn_recursive_attn_processor(name, module, processor) + + @classmethod + def from_transformer( + cls, + transformer, + num_layers: int = 5, + attention_head_dim: int = 128, + num_attention_heads: int = 24, + load_weights_from_transformer=True, + extra_condition_channels: int = 0, + ): + config = dict(transformer.config) + config["num_layers"] = num_layers + config["attention_head_dim"] = attention_head_dim + config["num_attention_heads"] = num_attention_heads + config["extra_condition_channels"] = extra_condition_channels + + controlnet = cls.from_config(config) + + if load_weights_from_transformer: + ms.load_param_into_net(controlnet.pos_embed, transformer.pos_embed.parameters_dict()) + ms.load_param_into_net(controlnet.time_text_embed, transformer.time_text_embed.parameters_dict()) + ms.load_param_into_net(controlnet.img_in, transformer.img_in.parameters_dict()) + ms.load_param_into_net(controlnet.txt_in, transformer.txt_in.parameters_dict()) + ms.load_param_into_net( + controlnet.transformer_blocks, transformer.transformer_blocks.parameters_dict(), strict_load=False + ) + + # zero_module + controlnet.controlnet_x_embedder.weight.set_data( + initializer( + "zeros", + controlnet.controlnet_x_embedder.weight.shape, + controlnet.controlnet_x_embedder.weight.dtype, + ) + ) + controlnet.controlnet_x_embedder.bias.set_data( + initializer( + "zeros", controlnet.controlnet_x_embedder.bias.shape, controlnet.controlnet_x_embedder.bias.dtype + ) + ) + + return controlnet + + def construct( + self, + hidden_states: ms.tensor, + controlnet_cond: ms.tensor, + conditioning_scale: float = 1.0, + 
encoder_hidden_states: ms.tensor = None,
+        encoder_hidden_states_mask: ms.tensor = None,
+        timestep: ms.tensor = None,
+        img_shapes: Optional[List[Tuple[int, int, int]]] = None,
+        txt_seq_lens: Optional[List[int]] = None,
+        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+        return_dict: bool = True,
+    ) -> Union[ms.tensor, Transformer2DModelOutput]:
+        """
+        The [`QwenImageControlNetModel`] forward method.
+
+        Args:
+            hidden_states (`ms.Tensor` of shape `(batch size, channel, height, width)`):
+                Input `hidden_states`.
+            controlnet_cond (`ms.Tensor`):
+                The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`.
+            conditioning_scale (`float`, defaults to `1.0`):
+                The scale factor for ControlNet outputs.
+            encoder_hidden_states (`ms.Tensor` of shape `(batch size, sequence_len, embed_dims)`):
+                Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
+            encoder_hidden_states_mask (`ms.Tensor` of shape `(batch size, sequence_len)`):
+                Attention mask for the text embeddings in `encoder_hidden_states`.
+            timestep (`ms.Tensor`):
+                Used to indicate denoising step.
+            img_shapes (`List[Tuple[int, int, int]]`, *optional*):
+                Shapes of the packed image latents, used to build the rotary position embeddings.
+            txt_seq_lens (`List[int]`, *optional*):
+                Sequence lengths of the text embeddings, used to build the rotary position embeddings.
+            joint_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
+                tuple.
+
+        Returns:
+            If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
+            `tuple` where the first element is the sample tensor.
+        """
+        if joint_attention_kwargs is not None:
+            joint_attention_kwargs = joint_attention_kwargs.copy()
+
+        if joint_attention_kwargs is not None and joint_attention_kwargs.get("scale", None) is not None:
+            logger.warning(
+                "Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
+ ) + hidden_states = self.img_in(hidden_states) + + # add + hidden_states = hidden_states + self.controlnet_x_embedder(controlnet_cond) + + temb = self.time_text_embed(timestep, hidden_states) + + image_rotary_emb = self.pos_embed(img_shapes, txt_seq_lens) + + timestep = timestep.to(hidden_states.dtype) + encoder_hidden_states = self.txt_norm(encoder_hidden_states) + encoder_hidden_states = self.txt_in(encoder_hidden_states) + + block_samples = () + for index_block, block in enumerate(self.transformer_blocks): + encoder_hidden_states, hidden_states = block( + hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + encoder_hidden_states_mask=encoder_hidden_states_mask, + temb=temb, + image_rotary_emb=image_rotary_emb, + joint_attention_kwargs=joint_attention_kwargs, + ) + block_samples = block_samples + (hidden_states,) + + # controlnet block + controlnet_block_samples = () + for block_sample, controlnet_block in zip(block_samples, self.controlnet_blocks): + block_sample = controlnet_block(block_sample) + controlnet_block_samples = controlnet_block_samples + (block_sample,) + + # scaling + controlnet_block_samples = [sample * conditioning_scale for sample in controlnet_block_samples] + controlnet_block_samples = None if len(controlnet_block_samples) == 0 else controlnet_block_samples + + if not return_dict: + return controlnet_block_samples + + return QwenImageControlNetOutput( + controlnet_block_samples=controlnet_block_samples, + ) + + +class QwenImageMultiControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, CacheMixin): + r""" + `QwenImageMultiControlNetModel` wrapper class for Multi-QwenImageControlNetModel + + This module is a wrapper for multiple instances of the `QwenImageControlNetModel`. The `construct()` API is designed + to be compatible with `QwenImageControlNetModel`. + + Args: + controlnets (`List[QwenImageControlNetModel]`): + Provides additional conditioning to the unet during the denoising process. You must set multiple + `QwenImageControlNetModel` as a list. 
+ """ + + def __init__(self, controlnets): + super().__init__() + self.nets = nn.CellList(controlnets) + + def construct( + self, + hidden_states: ms.tensor, + controlnet_cond: List[ms.tensor], + conditioning_scale: List[float], + encoder_hidden_states: ms.tensor = None, + encoder_hidden_states_mask: ms.tensor = None, + timestep: ms.tensor = None, + img_shapes: Optional[List[Tuple[int, int, int]]] = None, + txt_seq_lens: Optional[List[int]] = None, + joint_attention_kwargs: Optional[Dict[str, Any]] = None, + return_dict: bool = True, + ) -> Union[QwenImageControlNetOutput, Tuple]: + # ControlNet-Union with multiple conditions + # only load one ControlNet for saving memories + if len(self.nets) == 1: + controlnet = self.nets[0] + + for i, (image, scale) in enumerate(zip(controlnet_cond, conditioning_scale)): + block_samples = controlnet( + hidden_states=hidden_states, + controlnet_cond=image, + conditioning_scale=scale, + encoder_hidden_states=encoder_hidden_states, + encoder_hidden_states_mask=encoder_hidden_states_mask, + timestep=timestep, + img_shapes=img_shapes, + txt_seq_lens=txt_seq_lens, + joint_attention_kwargs=joint_attention_kwargs, + return_dict=return_dict, + ) + + # merge samples + if i == 0: + control_block_samples = block_samples + else: + if block_samples is not None and control_block_samples is not None: + control_block_samples = [ + control_block_sample + block_sample + for control_block_sample, block_sample in zip(control_block_samples, block_samples) + ] + else: + raise ValueError("QwenImageMultiControlNetModel only supports a single controlnet-union now.") + + return control_block_samples diff --git a/mindone/diffusers/pipelines/__init__.py b/mindone/diffusers/pipelines/__init__.py index 9962c4a5b5..978224b5e0 100644 --- a/mindone/diffusers/pipelines/__init__.py +++ b/mindone/diffusers/pipelines/__init__.py @@ -189,6 +189,8 @@ "QwenImageEditPipeline", "QwenImageEditPlusPipeline", "QwenImageEditInpaintPipeline", + "QwenImageControlNetInpaintPipeline", + "QwenImageControlNetPipeline", ], "sana": ["SanaPipeline", "SanaSprintPipeline", "SanaControlNetPipeline", "SanaSprintImg2ImgPipeline"], "semantic_stable_diffusion": ["SemanticStableDiffusionPipeline"], @@ -424,6 +426,8 @@ from .pipeline_utils import AudioPipelineOutput, DiffusionPipeline, ImagePipelineOutput, StableDiffusionMixin from .pixart_alpha import PixArtAlphaPipeline, PixArtSigmaPipeline from .qwenimage import ( + QwenImageControlNetInpaintPipeline, + QwenImageControlNetPipeline, QwenImageEditInpaintPipeline, QwenImageEditPipeline, QwenImageEditPlusPipeline, diff --git a/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_controlnet.py b/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_controlnet.py new file mode 100644 index 0000000000..7bf917faba --- /dev/null +++ b/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_controlnet.py @@ -0,0 +1,983 @@ +# Copyright 2025 Qwen-Image Team, InstantX Team and The HuggingFace Team. All rights reserved. +# +# This code is adapted from https://github.com/huggingface/diffusers +# with modifications to run diffusers on mindspore. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +from transformers import Qwen2Tokenizer + +import mindspore as ms +from mindspore import mint + +from ....transformers import Qwen2_5_VLForConditionalGeneration +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import QwenImageLoraLoaderMixin +from ...models import AutoencoderKLQwenImage, QwenImageTransformer2DModel +from ...models.controlnets.controlnet_qwenimage import QwenImageControlNetModel, QwenImageMultiControlNetModel +from ...schedulers import FlowMatchEulerDiscreteScheduler +from ...utils import deprecate, logging +from ...utils.mindspore_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline +from .pipeline_output import QwenImagePipelineOutput + +XLA_AVAILABLE = False + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import mindspore + >>> from mindone.diffusers.utils import load_image + >>> from mindone.diffusers import QwenImageControlNetModel, QwenImageMultiControlNetModel, QwenImageControlNetPipeline + + >>> # QwenImageControlNetModel + >>> controlnet = QwenImageControlNetModel.from_pretrained( + ... "InstantX/Qwen-Image-ControlNet-Union", mindspore_dtype=mindspore.bfloat16 + ... ) + >>> pipe = QwenImageControlNetPipeline.from_pretrained( + ... "Qwen/Qwen-Image", controlnet=controlnet, mindspore_dtype=mindspore.bfloat16 + ... ) + >>> prompt = ( + "Aesthetics art, traditional asian pagoda, elaborate golden accents, sky blue and white color palette, swirling cloud pattern," + " digital illustration, east asian architecture, ornamental rooftop, intricate detailing on building, cultural representation." + ) + >>> negative_prompt = " " + >>> control_image = load_image( + ... "https://huggingface.co/InstantX/Qwen-Image-ControlNet-Union/resolve/main/conds/canny.png" + ... ) + >>> # Depending on the variant being used, the pipeline call will slightly vary. + >>> # Refer to the pipeline documentation for more details. + >>> image = pipe( + ... prompt, + ... negative_prompt=negative_prompt, + ... control_image=control_image, + ... controlnet_conditioning_scale=1.0, + ... num_inference_steps=30, + ... true_cfg_scale=4.0, + ... )[0][0] + >>> image.save("qwenimage_cn_union.png") + + >>> # QwenImageMultiControlNetModel + >>> controlnet = QwenImageControlNetModel.from_pretrained( + ... "InstantX/Qwen-Image-ControlNet-Union", mindspore_dtype=mindspore.bfloat16 + ... ) + >>> controlnet = QwenImageMultiControlNetModel([controlnet]) + >>> pipe = QwenImageControlNetPipeline.from_pretrained( + ... "Qwen/Qwen-Image", controlnet=controlnet, mindspore_dtype=mindspore.bfloat16 + ... ) + >>> prompt = ( + "Aesthetics art, traditional asian pagoda, elaborate golden accents, sky blue and white color palette, swirling cloud pattern," + " digital illustration, east asian architecture, ornamental rooftop, intricate detailing on building, cultural representation." + ) + >>> negative_prompt = " " + >>> control_image = load_image( + ... 
"https://huggingface.co/InstantX/Qwen-Image-ControlNet-Union/resolve/main/conds/canny.png" + ... ) + >>> # Depending on the variant being used, the pipeline call will slightly vary. + >>> # Refer to the pipeline documentation for more details. + >>> image = pipe( + ... prompt, + ... negative_prompt=negative_prompt, + ... control_image=[control_image, control_image], + ... controlnet_conditioning_scale=[0.5, 0.5], + ... num_inference_steps=30, + ... true_cfg_scale=4.0, + ... )[0][0] + >>> image.save("qwenimage_cn_union_multi.png") + ``` +""" + + +# Coped from diffusers.pipelines.qwenimage.pipeline_qwenimage.calculate_shift +def calculate_shift( + image_seq_len, + base_seq_len: int = 256, + max_seq_len: int = 4096, + base_shift: float = 0.5, + max_shift: float = 1.15, +): + m = (max_shift - base_shift) / (max_seq_len - base_seq_len) + b = base_shift - m * base_seq_len + mu = image_seq_len * m + b + return mu + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents +def retrieve_latents( + vae, encoder_output: ms.tensor, generator: Optional[np.random.Generator] = None, sample_mode: str = "sample" +): + if sample_mode == "sample": + return vae.diag_gauss_dist.sample(encoder_output, generator=generator) + elif sample_mode == "argmax": + return vae.diag_gauss_dist.mode(encoder_output) + # This brach is not needed because the encoder_output type is ms.tensor as per AutoencoderKLOuput change + # elif hasattr(encoder_output, "latents"): + # return encoder_output.latents + else: + return encoder_output + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, + **kwargs, +): + r""" + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. + timesteps (`List[int]`, *optional*): + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. + + Returns: + `Tuple[ms.tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." 
+ ) + scheduler.set_timesteps(timesteps=timesteps, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(sigmas=sigmas, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +class QwenImageControlNetPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin): + r""" + The QwenImage pipeline for text-to-image generation. + + Args: + transformer ([`QwenImageTransformer2DModel`]): + Conditional Transformer (MMDiT) architecture to denoise the encoded image latents. + scheduler ([`FlowMatchEulerDiscreteScheduler`]): + A scheduler to be used in combination with `transformer` to denoise the encoded image latents. + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`Qwen2.5-VL-7B-Instruct`]): + [Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct), specifically the + [Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) variant. + tokenizer (`QwenTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPTokenizer). + """ + + model_cpu_offload_seq = "text_encoder->transformer->vae" + _callback_tensor_inputs = ["latents", "prompt_embeds"] + + def __init__( + self, + scheduler: FlowMatchEulerDiscreteScheduler, + vae: AutoencoderKLQwenImage, + text_encoder: Qwen2_5_VLForConditionalGeneration, + tokenizer: Qwen2Tokenizer, + transformer: QwenImageTransformer2DModel, + controlnet: Union[QwenImageControlNetModel, QwenImageMultiControlNetModel], + ): + super().__init__() + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + transformer=transformer, + scheduler=scheduler, + controlnet=controlnet, + ) + self.vae_scale_factor = 2 ** len(self.vae.temperal_downsample) if getattr(self, "vae", None) else 8 + # QwenImage latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible + # by the patch size. 
So the vae scale factor is multiplied by the patch size to account for this + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2) + self.tokenizer_max_length = 1024 + self.prompt_template_encode = ( + "<|im_start|>system\nDescribe the image by detailing the color, shape, size, " + "texture, quantity, text, spatial relationships of the objects and background:" + "<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n" + ) + self.prompt_template_encode_start_idx = 34 + self.default_sample_size = 128 + + # Coped from diffusers.pipelines.qwenimage.pipeline_qwenimage.extract_masked_hidden + def _extract_masked_hidden(self, hidden_states: ms.tensor, mask: ms.tensor): + bool_mask = mask.bool() + valid_lengths = bool_mask.sum(dim=1) + selected = hidden_states[bool_mask] + split_result = mint.split(selected, valid_lengths.tolist(), dim=0) + + return split_result + + # Coped from diffusers.pipelines.qwenimage.pipeline_qwenimage.get_qwen_prompt_embeds + def _get_qwen_prompt_embeds( + self, + prompt: Union[str, List[str]] = None, + dtype: Optional[ms.dtype] = None, + ): + dtype = dtype or self.text_encoder.dtype + + prompt = [prompt] if isinstance(prompt, str) else prompt + + template = self.prompt_template_encode + drop_idx = self.prompt_template_encode_start_idx + txt = [template.format(e) for e in prompt] + txt_tokens = self.tokenizer( + txt, max_length=self.tokenizer_max_length + drop_idx, padding=True, truncation=True, return_tensors="np" + ) + encoder_hidden_states = self.text_encoder( + input_ids=ms.tensor(txt_tokens.input_ids), + attention_mask=ms.tensor(txt_tokens.attention_mask), + output_hidden_states=True, + ) + hidden_states = encoder_hidden_states.hidden_states[-1] + split_hidden_states = self._extract_masked_hidden(hidden_states, ms.tensor(txt_tokens.attention_mask)) + split_hidden_states = [e[drop_idx:] for e in split_hidden_states] + attn_mask_list = [mint.ones(e.shape[0], dtype=ms.int64) for e in split_hidden_states] + max_seq_len = max([e.shape[0] for e in split_hidden_states]) + prompt_embeds = mint.stack( + [mint.cat([u, u.new_zeros((max_seq_len - u.shape[0], u.shape[1]))]) for u in split_hidden_states] + ) + encoder_attention_mask = mint.stack( + [mint.cat([u, u.new_zeros((max_seq_len - u.shape[0]))]) for u in attn_mask_list] + ) + + prompt_embeds = prompt_embeds.to(dtype=dtype) + + return prompt_embeds, encoder_attention_mask + + # Coped from diffusers.pipelines.qwenimage.pipeline_qwenimage.encode_prompt + def encode_prompt( + self, + prompt: Union[str, List[str]], + num_images_per_prompt: int = 1, + prompt_embeds: Optional[ms.tensor] = None, + prompt_embeds_mask: Optional[ms.tensor] = None, + max_sequence_length: int = 1024, + ): + r""" + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + num_images_per_prompt (`int`): + number of images that should be generated per prompt + prompt_embeds (`ms.tensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. 
+ """ + prompt = [prompt] if isinstance(prompt, str) else prompt + batch_size = len(prompt) if prompt_embeds is None else prompt_embeds.shape[0] + + if prompt_embeds is None: + prompt_embeds, prompt_embeds_mask = self._get_qwen_prompt_embeds(prompt) + + prompt_embeds = prompt_embeds[:, :max_sequence_length] + prompt_embeds_mask = prompt_embeds_mask[:, :max_sequence_length] + + _, seq_len, _ = prompt_embeds.shape + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + prompt_embeds_mask = prompt_embeds_mask.repeat(1, num_images_per_prompt, 1) + prompt_embeds_mask = prompt_embeds_mask.view(batch_size * num_images_per_prompt, seq_len) + + return prompt_embeds, prompt_embeds_mask + + def check_inputs( + self, + prompt, + height, + width, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + prompt_embeds_mask=None, + negative_prompt_embeds_mask=None, + callback_on_step_end_tensor_inputs=None, + max_sequence_length=None, + ): + if height % (self.vae_scale_factor * 2) != 0 or width % (self.vae_scale_factor * 2) != 0: + logger.warning( + f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}. " + "Dimensions will be resized accordingly" + ) + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found" + f" {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`: " + f"{negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and prompt_embeds_mask is None: + raise ValueError( + "If `prompt_embeds` are provided, `prompt_embeds_mask` also have to be passed. Make sure to generate `prompt_embeds_mask`" + " from the same text encoder that was used to generate `prompt_embeds`." + ) + if negative_prompt_embeds is not None and negative_prompt_embeds_mask is None: + raise ValueError( + "If `negative_prompt_embeds` are provided, `negative_prompt_embeds_mask` also have to be passed. Make sure to generate" + " `negative_prompt_embeds_mask` from the same text encoder that was used to generate `negative_prompt_embeds`." 
+ ) + + if max_sequence_length is not None and max_sequence_length > 1024: + raise ValueError(f"`max_sequence_length` cannot be greater than 1024 but is {max_sequence_length}") + + @staticmethod + # Copied from diffusers.pipelines.qwenimage.pipeline_qwenimage.QwenImagePipeline._pack_latents + def _pack_latents(latents, batch_size, num_channels_latents, height, width): + latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2) + latents = latents.permute(0, 2, 4, 1, 3, 5) + latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4) + + return latents + + @staticmethod + # Copied from diffusers.pipelines.qwenimage.pipeline_qwenimage.QwenImagePipeline._unpack_latents + def _unpack_latents(latents, height, width, vae_scale_factor): + batch_size, num_patches, channels = latents.shape + + # VAE applies 8x compression on images but we must also account for packing which requires + # latent height and width to be divisible by 2. + height = 2 * (int(height) // (vae_scale_factor * 2)) + width = 2 * (int(width) // (vae_scale_factor * 2)) + + latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2) + latents = latents.permute(0, 3, 1, 4, 2, 5) + + latents = latents.reshape(batch_size, channels // (2 * 2), 1, height, width) + + return latents + + def enable_vae_slicing(self): + r""" + Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to + compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. + """ + depr_message = ( + f"Calling `enable_vae_slicing()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version." + " Please use `pipe.vae.enable_slicing()`." + ) + deprecate( + "enable_vae_slicing", + "0.40.0", + depr_message, + ) + self.vae.enable_slicing() + + def disable_vae_slicing(self): + r""" + Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to + computing decoding in one step. + """ + depr_message = ( + f"Calling `disable_vae_slicing()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version." + " Please use `pipe.vae.disable_slicing()`." + ) + deprecate( + "disable_vae_slicing", + "0.40.0", + depr_message, + ) + self.vae.disable_slicing() + + def enable_vae_tiling(self): + r""" + Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to + compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow + processing larger images. + """ + depr_message = ( + f"Calling `enable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version." + " Please use `pipe.vae.enable_tiling()`." + ) + deprecate( + "enable_vae_tiling", + "0.40.0", + depr_message, + ) + self.vae.enable_tiling() + + def disable_vae_tiling(self): + r""" + Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to + computing decoding in one step. + """ + depr_message = ( + f"Calling `disable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version." + " Please use `pipe.vae.disable_tiling()`." 
+ ) + deprecate( + "disable_vae_tiling", + "0.40.0", + depr_message, + ) + self.vae.disable_tiling() + + # Copied from diffusers.pipelines.qwenimage.pipeline_qwenimage.QwenImagePipeline.prepare_latents + def prepare_latents( + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): + # VAE applies 8x compression on images but we must also account for packing which requires + # latent height and width to be divisible by 2. + height = 2 * (int(height) // (self.vae_scale_factor * 2)) + width = 2 * (int(width) // (self.vae_scale_factor * 2)) + + shape = (batch_size, 1, num_channels_latents, height, width) + + if latents is not None: + return latents.to(dtype=dtype) + + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + latents = randn_tensor(shape, generator=generator, dtype=dtype) + latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width) + + return latents + + # Copied from diffusers.pipelines.controlnet_sd3.pipeline_stable_diffusion_3_controlnet.StableDiffusion3ControlNetPipeline.prepare_image + def prepare_image( + self, + image, + width, + height, + batch_size, + num_images_per_prompt, + dtype, + do_classifier_free_guidance=False, + guess_mode=False, + ): + if isinstance(image, ms.tensor): + pass + else: + image = self.image_processor.preprocess(image, height=height, width=width) + + image_batch_size = image.shape[0] + + if image_batch_size == 1: + repeat_by = batch_size + else: + # image batch size is the same as prompt batch size + repeat_by = num_images_per_prompt + + image = image.repeat_interleave(repeat_by, dim=0) + + image = image.to(dtype=dtype) + + if do_classifier_free_guidance and not guess_mode: + image = mint.cat([image] * 2) + + return image + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def attention_kwargs(self): + return self._attention_kwargs + + @property + def num_timesteps(self): + return self._num_timesteps + + @property + def current_timestep(self): + return self._current_timestep + + @property + def interrupt(self): + return self._interrupt + + def __call__( + self, + prompt: Union[str, List[str]] = None, + negative_prompt: Union[str, List[str]] = None, + true_cfg_scale: float = 4.0, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + sigmas: Optional[List[float]] = None, + guidance_scale: Optional[float] = None, + control_guidance_start: Union[float, List[float]] = 0.0, + control_guidance_end: Union[float, List[float]] = 1.0, + control_image: PipelineImageInput = None, + controlnet_conditioning_scale: Union[float, List[float]] = 1.0, + num_images_per_prompt: int = 1, + generator: Optional[Union[np.random.Generator, List[np.random.Generator]]] = None, + latents: Optional[ms.tensor] = None, + prompt_embeds: Optional[ms.tensor] = None, + prompt_embeds_mask: Optional[ms.tensor] = None, + negative_prompt_embeds: Optional[ms.tensor] = None, + negative_prompt_embeds_mask: Optional[ms.tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + attention_kwargs: Optional[Dict[str, Any]] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + max_sequence_length: 
int = 512,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
+                instead.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `true_cfg_scale` is
+                not greater than `1`).
+            true_cfg_scale (`float`, *optional*, defaults to 4.0):
+                Guidance scale as defined in [Classifier-Free Diffusion
+                Guidance](https://huggingface.co/papers/2207.12598). `true_cfg_scale` is defined as `w` of equation 2.
+                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Classifier-free guidance is enabled by
+                setting `true_cfg_scale > 1` and a provided `negative_prompt`. A higher guidance scale encourages the
+                model to generate images that are closely linked to the text `prompt`, usually at the expense of lower
+                image quality.
+            height (`int`, *optional*, defaults to self.default_sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image. This is set to 1024 by default for the best results.
+            width (`int`, *optional*, defaults to self.default_sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            guidance_scale (`float`, *optional*, defaults to None):
+                A guidance scale value for guidance distilled models. Unlike the traditional classifier-free guidance
+                where the guidance scale is applied during inference through noise prediction rescaling, guidance
+                distilled models take the guidance scale directly as an input parameter during forward pass. Guidance
+                scale is enabled by setting `guidance_scale > 1`. A higher guidance scale encourages the model to
+                generate images that are closely linked to the text `prompt`, usually at the expense of lower image
+                quality. This parameter in the pipeline is there to support future guidance-distilled models when they
+                come up. It is ignored when not using guidance distilled models. To enable traditional classifier-free
+                guidance, please pass `true_cfg_scale > 1.0` and `negative_prompt` (even an empty negative prompt like
+                " " should enable classifier-free guidance computations).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`np.random.Generator` or `List[np.random.Generator]`, *optional*):
+                One or a list of [numpy generator(s)](https://numpy.org/doc/stable/reference/random/generator.html)
+                to make generation deterministic.
+            latents (`ms.tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will be generated by sampling using the supplied random `generator`.
+            prompt_embeds (`ms.tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`ms.tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.qwenimage.QwenImagePipelineOutput`] instead of a plain tuple.
+            attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            callback_on_step_end (`Callable`, *optional*):
+                A function that is called at the end of each denoising step during inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            max_sequence_length (`int`, *optional*, defaults to 512): Maximum sequence length to use with the `prompt`.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.qwenimage.QwenImagePipelineOutput`] or `tuple`:
+                [`~pipelines.qwenimage.QwenImagePipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When
+                returning a tuple, the first element is a list with the generated images.
+        """
+
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
+
+        if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
+            control_guidance_start = len(control_guidance_end) * [control_guidance_start]
+        elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
+            control_guidance_end = len(control_guidance_start) * [control_guidance_end]
+        elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
+            mult = len(control_image) if isinstance(self.controlnet, QwenImageMultiControlNetModel) else 1
+            control_guidance_start, control_guidance_end = (
+                mult * [control_guidance_start],
+                mult * [control_guidance_end],
+            )
+
+        # 1. Check inputs.
Raise error if not correct + self.check_inputs( + prompt, + height, + width, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + prompt_embeds_mask=prompt_embeds_mask, + negative_prompt_embeds_mask=negative_prompt_embeds_mask, + callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs, + max_sequence_length=max_sequence_length, + ) + + self._guidance_scale = guidance_scale + self._attention_kwargs = attention_kwargs + self._current_timestep = None + self._interrupt = False + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + has_neg_prompt = negative_prompt is not None or ( + negative_prompt_embeds is not None and negative_prompt_embeds_mask is not None + ) + + if true_cfg_scale > 1 and not has_neg_prompt: + logger.warning( + f"true_cfg_scale is passed as {true_cfg_scale}, but classifier-free guidance is not enabled since no negative_prompt is provided." + ) + elif true_cfg_scale <= 1 and has_neg_prompt: + logger.warning( + " negative_prompt is passed but classifier-free guidance is not enabled since true_cfg_scale <= 1" + ) + + do_true_cfg = true_cfg_scale > 1 and has_neg_prompt + prompt_embeds, prompt_embeds_mask = self.encode_prompt( + prompt=prompt, + prompt_embeds=prompt_embeds, + prompt_embeds_mask=prompt_embeds_mask, + num_images_per_prompt=num_images_per_prompt, + max_sequence_length=max_sequence_length, + ) + if do_true_cfg: + negative_prompt_embeds, negative_prompt_embeds_mask = self.encode_prompt( + prompt=negative_prompt, + prompt_embeds=negative_prompt_embeds, + prompt_embeds_mask=negative_prompt_embeds_mask, + num_images_per_prompt=num_images_per_prompt, + max_sequence_length=max_sequence_length, + ) + + # 3. 
Prepare control image + num_channels_latents = self.transformer.config.in_channels // 4 + if isinstance(self.controlnet, QwenImageControlNetModel): + control_image = self.prepare_image( + image=control_image, + width=width, + height=height, + batch_size=batch_size * num_images_per_prompt, + num_images_per_prompt=num_images_per_prompt, + dtype=self.vae.dtype, + ) + height, width = control_image.shape[-2:] + + if control_image.ndim == 4: + control_image = control_image.unsqueeze(2) + + # vae encode + self.vae_scale_factor = 2 ** len(self.vae.temperal_downsample) + latents_mean = ms.tensor(self.vae.config.latents_mean).view(1, self.vae.config.z_dim, 1, 1, 1) + latents_std = 1.0 / ms.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1) + + control_image = retrieve_latents(self.vae, self.vae.encode(control_image), generator=generator) + control_image = (control_image - latents_mean) * latents_std + + control_image = control_image.permute(0, 2, 1, 3, 4) + + # pack + control_image = self._pack_latents( + control_image, + batch_size=control_image.shape[0], + num_channels_latents=num_channels_latents, + height=control_image.shape[3], + width=control_image.shape[4], + ).to(dtype=prompt_embeds.dtype) + + else: + if isinstance(self.controlnet, QwenImageMultiControlNetModel): + control_images = [] + for control_image_ in control_image: + control_image_ = self.prepare_image( + image=control_image_, + width=width, + height=height, + batch_size=batch_size * num_images_per_prompt, + num_images_per_prompt=num_images_per_prompt, + dtype=self.vae.dtype, + ) + + height, width = control_image_.shape[-2:] + + if control_image_.ndim == 4: + control_image_ = control_image_.unsqueeze(2) + + # vae encode + self.vae_scale_factor = 2 ** len(self.vae.temperal_downsample) + latents_mean = ms.tensor(self.vae.config.latents_mean).view(1, self.vae.config.z_dim, 1, 1, 1) + latents_std = 1.0 / ms.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1) + + control_image_ = retrieve_latents(self.vae, self.vae.encode(control_image_), generator=generator) + control_image_ = (control_image_ - latents_mean) * latents_std + + control_image_ = control_image_.permute(0, 2, 1, 3, 4) + + # pack + control_image_ = self._pack_latents( + control_image_, + batch_size=control_image_.shape[0], + num_channels_latents=num_channels_latents, + height=control_image_.shape[3], + width=control_image_.shape[4], + ).to(dtype=prompt_embeds.dtype) + + control_images.append(control_image_) + + control_image = control_images + + # 4. Prepare latent variables + num_channels_latents = self.transformer.config.in_channels // 4 + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + generator, + latents, + ) + img_shapes = [(1, height // self.vae_scale_factor // 2, width // self.vae_scale_factor // 2)] * batch_size + + # 5. 
Prepare timesteps + sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas + image_seq_len = latents.shape[1] + mu = calculate_shift( + image_seq_len, + self.scheduler.config.get("base_image_seq_len", 256), + self.scheduler.config.get("max_image_seq_len", 4096), + self.scheduler.config.get("base_shift", 0.5), + self.scheduler.config.get("max_shift", 1.15), + ) + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, + num_inference_steps, + sigmas=sigmas, + mu=mu, + ) + num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) + self._num_timesteps = len(timesteps) + + controlnet_keep = [] + for i in range(len(timesteps)): + keeps = [ + 1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e) + for s, e in zip(control_guidance_start, control_guidance_end) + ] + controlnet_keep.append(keeps[0] if isinstance(self.controlnet, QwenImageControlNetModel) else keeps) + + # handle guidance + if self.transformer.config.guidance_embeds and guidance_scale is None: + raise ValueError("guidance_scale is required for guidance-distilled model.") + elif self.transformer.config.guidance_embeds: + guidance = mint.full([1], guidance_scale, dtype=ms.float32) + guidance = guidance.expand((latents.shape[0],)) + elif not self.transformer.config.guidance_embeds and guidance_scale is not None: + logger.warning( + f"guidance_scale is passed as {guidance_scale}, but ignored since the model is not guidance-distilled." + ) + guidance = None + elif not self.transformer.config.guidance_embeds and guidance_scale is None: + guidance = None + + if self.attention_kwargs is None: + self._attention_kwargs = {} + + # 6. Denoising loop + self.scheduler.set_begin_index(0) + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + self._current_timestep = t + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timestep = t.expand((latents.shape[0],)).to(latents.dtype) + + if isinstance(controlnet_keep[i], list): + cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])] + else: + controlnet_cond_scale = controlnet_conditioning_scale + if isinstance(controlnet_cond_scale, list): + controlnet_cond_scale = controlnet_cond_scale[0] + cond_scale = controlnet_cond_scale * controlnet_keep[i] + + # controlnet + controlnet_block_samples = self.controlnet( + hidden_states=latents, + controlnet_cond=control_image, + conditioning_scale=cond_scale, + timestep=timestep / 1000, + encoder_hidden_states=prompt_embeds, + encoder_hidden_states_mask=prompt_embeds_mask, + img_shapes=img_shapes, + txt_seq_lens=prompt_embeds_mask.sum(dim=1).tolist(), + return_dict=False, + ) + + noise_pred = self.transformer( + hidden_states=latents, + timestep=timestep / 1000, + encoder_hidden_states=prompt_embeds, + encoder_hidden_states_mask=prompt_embeds_mask, + img_shapes=img_shapes, + txt_seq_lens=prompt_embeds_mask.sum(dim=1).tolist(), + controlnet_block_samples=controlnet_block_samples, + attention_kwargs=self.attention_kwargs, + return_dict=False, + )[0] + + if do_true_cfg: + neg_noise_pred = self.transformer( + hidden_states=latents, + timestep=timestep / 1000, + guidance=guidance, + encoder_hidden_states_mask=negative_prompt_embeds_mask, + encoder_hidden_states=negative_prompt_embeds, + img_shapes=img_shapes, + txt_seq_lens=negative_prompt_embeds_mask.sum(dim=1).tolist(), + 
controlnet_block_samples=controlnet_block_samples, + attention_kwargs=self.attention_kwargs, + return_dict=False, + )[0] + comb_pred = neg_noise_pred + true_cfg_scale * (noise_pred - neg_noise_pred) + + cond_norm = mint.norm(noise_pred, dim=-1, keepdim=True) + noise_norm = mint.norm(comb_pred, dim=-1, keepdim=True) + noise_pred = comb_pred * (cond_norm / noise_norm) + + # compute the previous noisy sample x_t -> x_t-1 + latents_dtype = latents.dtype + latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0] + + if latents.dtype != latents_dtype: + latents = latents.to(latents_dtype) + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + + self._current_timestep = None + if output_type == "latent": + image = latents + else: + latents = self._unpack_latents(latents, height, width, self.vae_scale_factor) + latents = latents.to(self.vae.dtype) + latents_mean = ( + ms.tensor(self.vae.config.latents_mean).view(1, self.vae.config.z_dim, 1, 1, 1).to(latents.dtype) + ) + latents_std = 1.0 / ms.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1).to( + latents.dtype + ) + latents = latents / latents_std + latents_mean + image = self.vae.decode(latents, return_dict=False)[0][:, :, 0] + image = self.image_processor.postprocess(image, output_type=output_type) + + if not return_dict: + return (image,) + + return QwenImagePipelineOutput(images=image) diff --git a/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_controlnet_inpaint.py b/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_controlnet_inpaint.py new file mode 100644 index 0000000000..b91d7866e1 --- /dev/null +++ b/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_controlnet_inpaint.py @@ -0,0 +1,914 @@ +# Copyright 2025 Qwen-Image Team, The InstantX Team and The HuggingFace Team. All rights reserved. +# +# This code is adapted from https://github.com/huggingface/diffusers +# with modifications to run diffusers on mindspore. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
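
Before the inpainting variant that follows, here is a minimal, dependency-free sketch of the two small scheduling helpers that `QwenImageControlNetPipeline` above relies on: the flow-matching shift `mu` produced by `calculate_shift`, and the per-step ControlNet gating built by the `controlnet_keep` loop. `controlnet_keep_schedule` is a hypothetical name used only for illustration; only `calculate_shift` exists under that name in the file above.

```python
def calculate_shift(
    image_seq_len,
    base_seq_len=256,
    max_seq_len=4096,
    base_shift=0.5,
    max_shift=1.15,
):
    # Linearly interpolate the flow-matching shift `mu` between base_shift and
    # max_shift as the packed image sequence length grows.
    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
    b = base_shift - m * base_seq_len
    return image_seq_len * m + b


def controlnet_keep_schedule(num_steps, start=0.0, end=1.0):
    # Returns 1.0 while the current step fraction lies inside [start, end], else 0.0,
    # matching the `controlnet_keep` expression in the denoising setup above.
    return [
        1.0 - float(i / num_steps < start or (i + 1) / num_steps > end)
        for i in range(num_steps)
    ]


if __name__ == "__main__":
    # A 1024x1024 image packs into (1024 // 8 // 2) ** 2 = 4096 latent tokens,
    # which lands exactly on max_shift.
    print(round(calculate_shift(4096), 4))         # 1.15
    print(controlnet_keep_schedule(10, 0.0, 0.5))  # ControlNet active for the first 5 of 10 steps
```

The resulting `mu` is what the pipeline passes to `retrieve_timesteps(..., mu=mu)`, and each entry of the keep schedule is multiplied into `controlnet_conditioning_scale` to obtain the `cond_scale` applied at that step.
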
+ +import inspect +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +from transformers import Qwen2Tokenizer + +import mindspore as ms +from mindspore import mint + +from ....transformers import Qwen2_5_VLForConditionalGeneration +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import QwenImageLoraLoaderMixin +from ...models import AutoencoderKLQwenImage, QwenImageTransformer2DModel +from ...models.controlnets.controlnet_qwenimage import QwenImageControlNetModel, QwenImageMultiControlNetModel +from ...schedulers import FlowMatchEulerDiscreteScheduler +from ...utils import logging +from ...utils.mindspore_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline +from .pipeline_output import QwenImagePipelineOutput + +XLA_AVAILABLE = False + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import mindspore + >>> from mindone.diffusers.utils import load_image + >>> from mindone.diffusers import QwenImageControlNetModel, QwenImageControlNetInpaintPipeline + + >>> base_model_path = "Qwen/Qwen-Image" + >>> controlnet_model_path = "InstantX/Qwen-Image-ControlNet-Inpainting" + >>> controlnet = QwenImageControlNetModel.from_pretrained(controlnet_model_path, mindspore_dtype=mindspore.bfloat16) + >>> pipe = QwenImageControlNetInpaintPipeline.from_pretrained( + ... base_model_path, controlnet=controlnet, mindspore_dtype=mindspore.bfloat16 + ... ) + >>> image = load_image( + ... "https://huggingface.co/InstantX/Qwen-Image-ControlNet-Inpainting/resolve/main/assets/images/image1.png" + ... ) + >>> mask_image = load_image( + ... "https://huggingface.co/InstantX/Qwen-Image-ControlNet-Inpainting/resolve/main/assets/masks/mask1.png" + ... ) + >>> prompt = "一辆绿色的出租车行驶在路上" + >>> result = pipe( + ... prompt=prompt, + ... control_image=image, + ... control_mask=mask_image, + ... controlnet_conditioning_scale=1.0, + ... width=mask_image.size[0], + ... height=mask_image.size[1], + ... true_cfg_scale=4.0, + ... 
)[0][0] + >>> image.save("qwenimage_controlnet_inpaint.png") + ``` +""" + + +# Coped from diffusers.pipelines.qwenimage.pipeline_qwenimage.calculate_shift +def calculate_shift( + image_seq_len, + base_seq_len: int = 256, + max_seq_len: int = 4096, + base_shift: float = 0.5, + max_shift: float = 1.15, +): + m = (max_shift - base_shift) / (max_seq_len - base_seq_len) + b = base_shift - m * base_seq_len + mu = image_seq_len * m + b + return mu + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents +def retrieve_latents( + vae, encoder_output: ms.tensor, generator: Optional[np.random.Generator] = None, sample_mode: str = "sample" +): + if sample_mode == "sample": + return vae.diag_gauss_dist.sample(encoder_output, generator=generator) + elif sample_mode == "argmax": + return vae.diag_gauss_dist.mode(encoder_output) + # This brach is not needed because the encoder_output type is ms.tensor as per AutoencoderKLOuput change + # elif hasattr(encoder_output, "latents"): + # return encoder_output.latents + else: + return encoder_output + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, + **kwargs, +): + r""" + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. + timesteps (`List[int]`, *optional*): + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. + + Returns: + `Tuple[ms.tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." 
+ ) + scheduler.set_timesteps(sigmas=sigmas, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +class QwenImageControlNetInpaintPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin): + r""" + The QwenImage pipeline for text-to-image generation. + + Args: + transformer ([`QwenImageTransformer2DModel`]): + Conditional Transformer (MMDiT) architecture to denoise the encoded image latents. + scheduler ([`FlowMatchEulerDiscreteScheduler`]): + A scheduler to be used in combination with `transformer` to denoise the encoded image latents. + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`Qwen2.5-VL-7B-Instruct`]): + [Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct), specifically the + [Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) variant. + tokenizer (`QwenTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPTokenizer). + """ + + model_cpu_offload_seq = "text_encoder->transformer->vae" + _callback_tensor_inputs = ["latents", "prompt_embeds"] + + def __init__( + self, + scheduler: FlowMatchEulerDiscreteScheduler, + vae: AutoencoderKLQwenImage, + text_encoder: Qwen2_5_VLForConditionalGeneration, + tokenizer: Qwen2Tokenizer, + transformer: QwenImageTransformer2DModel, + controlnet: QwenImageControlNetModel, + ): + super().__init__() + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + transformer=transformer, + scheduler=scheduler, + controlnet=controlnet, + ) + self.vae_scale_factor = 2 ** len(self.vae.temperal_downsample) if getattr(self, "vae", None) else 8 + # QwenImage latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible + # by the patch size. 
So the vae scale factor is multiplied by the patch size to account for this + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2) + + self.mask_processor = VaeImageProcessor( + vae_scale_factor=self.vae_scale_factor * 2, + do_resize=True, + do_convert_grayscale=True, + do_normalize=False, + do_binarize=True, + ) + + self.tokenizer_max_length = 1024 + self.prompt_template_encode = ( + "<|im_start|>system\nDescribe the image by detailing the color, shape, size, " + "texture, quantity, text, spatial relationships of the objects and background:" + "<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n" + ) + self.prompt_template_encode_start_idx = 34 + self.default_sample_size = 128 + + # Coped from diffusers.pipelines.qwenimage.pipeline_qwenimage.extract_masked_hidden + def _extract_masked_hidden(self, hidden_states: ms.tensor, mask: ms.tensor): + bool_mask = mask.bool() + valid_lengths = bool_mask.sum(dim=1) + selected = hidden_states[bool_mask] + split_result = mint.split(selected, valid_lengths.tolist(), dim=0) + + return split_result + + # Coped from diffusers.pipelines.qwenimage.pipeline_qwenimage.get_qwen_prompt_embeds + def _get_qwen_prompt_embeds( + self, + prompt: Union[str, List[str]] = None, + dtype: Optional[ms.dtype] = None, + ): + dtype = dtype or self.text_encoder.dtype + + prompt = [prompt] if isinstance(prompt, str) else prompt + + template = self.prompt_template_encode + drop_idx = self.prompt_template_encode_start_idx + txt = [template.format(e) for e in prompt] + txt_tokens = self.tokenizer( + txt, max_length=self.tokenizer_max_length + drop_idx, padding=True, truncation=True, return_tensors="np" + ) + encoder_hidden_states = self.text_encoder( + input_ids=ms.tensor(txt_tokens.input_ids), + attention_mask=ms.tensor(txt_tokens.attention_mask), + output_hidden_states=True, + ) + hidden_states = encoder_hidden_states.hidden_states[-1] + split_hidden_states = self._extract_masked_hidden(hidden_states, ms.tensor(txt_tokens.attention_mask)) + split_hidden_states = [e[drop_idx:] for e in split_hidden_states] + attn_mask_list = [mint.ones(e.shape[0], dtype=ms.int64) for e in split_hidden_states] + max_seq_len = max([e.shape[0] for e in split_hidden_states]) + prompt_embeds = mint.stack( + [mint.cat([u, u.new_zeros((max_seq_len - u.shape[0], u.shape[1]))]) for u in split_hidden_states] + ) + encoder_attention_mask = mint.stack( + [mint.cat([u, u.new_zeros((max_seq_len - u.shape[0]))]) for u in attn_mask_list] + ) + + prompt_embeds = prompt_embeds.to(dtype=dtype) + + return prompt_embeds, encoder_attention_mask + + # Coped from diffusers.pipelines.qwenimage.pipeline_qwenimage.encode_prompt + def encode_prompt( + self, + prompt: Union[str, List[str]], + num_images_per_prompt: int = 1, + prompt_embeds: Optional[ms.tensor] = None, + prompt_embeds_mask: Optional[ms.tensor] = None, + max_sequence_length: int = 1024, + ): + r""" + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + num_images_per_prompt (`int`): + number of images that should be generated per prompt + prompt_embeds (`ms.tensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. 
+ """ + prompt = [prompt] if isinstance(prompt, str) else prompt + batch_size = len(prompt) if prompt_embeds is None else prompt_embeds.shape[0] + + if prompt_embeds is None: + prompt_embeds, prompt_embeds_mask = self._get_qwen_prompt_embeds(prompt) + + prompt_embeds = prompt_embeds[:, :max_sequence_length] + prompt_embeds_mask = prompt_embeds_mask[:, :max_sequence_length] + + _, seq_len, _ = prompt_embeds.shape + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + prompt_embeds_mask = prompt_embeds_mask.repeat(1, num_images_per_prompt, 1) + prompt_embeds_mask = prompt_embeds_mask.view(batch_size * num_images_per_prompt, seq_len) + + return prompt_embeds, prompt_embeds_mask + + def check_inputs( + self, + prompt, + height, + width, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + prompt_embeds_mask=None, + negative_prompt_embeds_mask=None, + callback_on_step_end_tensor_inputs=None, + max_sequence_length=None, + ): + if height % (self.vae_scale_factor * 2) != 0 or width % (self.vae_scale_factor * 2) != 0: + logger.warning( + f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}. Dimensions will be resized accordingly" + ) + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found" + f" {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`: {negative_prompt_embeds}. " + "Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and prompt_embeds_mask is None: + raise ValueError( + "If `prompt_embeds` are provided, `prompt_embeds_mask` also have to be passed. Make sure to generate `prompt_embeds_mask`" + " from the same text encoder that was used to generate `prompt_embeds`." + ) + if negative_prompt_embeds is not None and negative_prompt_embeds_mask is None: + raise ValueError( + "If `negative_prompt_embeds` are provided, `negative_prompt_embeds_mask` also have to be passed. Make sure to generate" + " `negative_prompt_embeds_mask` from the same text encoder that was used to generate `negative_prompt_embeds`." 
+ ) + + if max_sequence_length is not None and max_sequence_length > 1024: + raise ValueError(f"`max_sequence_length` cannot be greater than 1024 but is {max_sequence_length}") + + @staticmethod + # Copied from diffusers.pipelines.qwenimage.pipeline_qwenimage.QwenImagePipeline._pack_latents + def _pack_latents(latents, batch_size, num_channels_latents, height, width): + latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2) + latents = latents.permute(0, 2, 4, 1, 3, 5) + latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4) + + return latents + + @staticmethod + # Copied from diffusers.pipelines.qwenimage.pipeline_qwenimage.QwenImagePipeline._unpack_latents + def _unpack_latents(latents, height, width, vae_scale_factor): + batch_size, num_patches, channels = latents.shape + + # VAE applies 8x compression on images but we must also account for packing which requires + # latent height and width to be divisible by 2. + height = 2 * (int(height) // (vae_scale_factor * 2)) + width = 2 * (int(width) // (vae_scale_factor * 2)) + + latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2) + latents = latents.permute(0, 3, 1, 4, 2, 5) + + latents = latents.reshape(batch_size, channels // (2 * 2), 1, height, width) + + return latents + + def enable_vae_slicing(self): + r""" + Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to + compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. + """ + self.vae.enable_slicing() + + def disable_vae_slicing(self): + r""" + Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to + computing decoding in one step. + """ + self.vae.disable_slicing() + + def enable_vae_tiling(self): + r""" + Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to + compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow + processing larger images. + """ + self.vae.enable_tiling() + + def disable_vae_tiling(self): + r""" + Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to + computing decoding in one step. + """ + self.vae.disable_tiling() + + # Copied from diffusers.pipelines.qwenimage.pipeline_qwenimage.QwenImagePipeline.prepare_latents + def prepare_latents( + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): + # VAE applies 8x compression on images but we must also account for packing which requires + # latent height and width to be divisible by 2. + height = 2 * (int(height) // (self.vae_scale_factor * 2)) + width = 2 * (int(width) // (self.vae_scale_factor * 2)) + + shape = (batch_size, 1, num_channels_latents, height, width) + + if latents is not None: + return latents.to(dtype=dtype) + + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + latents = randn_tensor(shape, generator=generator, dtype=dtype) + latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width) + + return latents + + # Copied from diffusers.pipelines.controlnet_sd3.pipeline_stable_diffusion_3_controlnet.StableDiffusion3ControlNetPipeline.prepare_image + def prepare_image( + self, + image, + width, + height, + batch_size, + num_images_per_prompt, + dtype, + do_classifier_free_guidance=False, + guess_mode=False, + ): + if isinstance(image, ms.tensor): + pass + else: + image = self.image_processor.preprocess(image, height=height, width=width) + + image_batch_size = image.shape[0] + + if image_batch_size == 1: + repeat_by = batch_size + else: + # image batch size is the same as prompt batch size + repeat_by = num_images_per_prompt + + image = image.repeat_interleave(repeat_by, dim=0) + + image = image.to(dtype=dtype) + + if do_classifier_free_guidance and not guess_mode: + image = mint.cat([image] * 2) + + return image + + def prepare_image_with_mask( + self, + image, + mask, + width, + height, + batch_size, + num_images_per_prompt, + dtype, + do_classifier_free_guidance=False, + guess_mode=False, + ): + if isinstance(image, ms.Tensor): + pass + else: + image = self.image_processor.preprocess(image, height=height, width=width) + + image_batch_size = image.shape[0] + + if image_batch_size == 1: + repeat_by = batch_size + else: + # image batch size is the same as prompt batch size + repeat_by = num_images_per_prompt + + image = image.repeat_interleave(repeat_by, dim=0) + image = image.to(dtype=dtype) # (bsz, 3, height_ori, width_ori) + + # Prepare mask + if isinstance(mask, ms.Tensor): + pass + else: + mask = self.mask_processor.preprocess(mask, height=height, width=width) + mask = mask.repeat_interleave(repeat_by, dim=0) + mask = mask.to(dtype=dtype) # (bsz, 1, height_ori, width_ori) + + if image.ndim == 4: + image = image.unsqueeze(2) + + if mask.ndim == 4: + mask = mask.unsqueeze(2) + + # Get masked image + masked_image = image.clone() + masked_image[(mask > 0.5).repeat(1, 3, 1, 1, 1)] = -1 # (bsz, 3, 1, height_ori, width_ori) + + self.vae_scale_factor = 2 ** len(self.vae.temperal_downsample) + latents_mean = ms.tensor(self.vae.config.latents_mean).view(1, self.vae.config.z_dim, 1, 1, 1) + latents_std = 1.0 / ms.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1) + + # Encode to latents + image_latents = self.vae.encode(masked_image.to(self.vae.dtype)).latent_dist.sample() + image_latents = (image_latents - latents_mean) * latents_std + image_latents = image_latents.to(dtype) # Size([1, 16, 1, height_ori//8, width_ori//8]) + + mask = mint.nn.functional.interpolate( + mask, size=(image_latents.shape[-3], image_latents.shape[-2], image_latents.shape[-1]) + ) + mask = 1 - mask # Size([1, 1, 1, height_ori//8, width_ori//8]) + + control_image = mint.cat([image_latents, mask], dim=1) # Size([1, 16+1, 1, height_ori//8, width_ori//8]) + + control_image = control_image.permute(0, 2, 1, 3, 4) # Size([1, 1, 16+1, height_ori//8, width_ori//8]) + + # pack + control_image = self._pack_latents( + control_image, + batch_size=control_image.shape[0], + num_channels_latents=control_image.shape[2], + height=control_image.shape[3], + width=control_image.shape[4], + ) + + if do_classifier_free_guidance and not guess_mode: + control_image = mint.cat([control_image] * 2) + + return control_image + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def attention_kwargs(self): + return 
self._attention_kwargs
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def current_timestep(self):
+        return self._current_timestep
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        negative_prompt: Union[str, List[str]] = None,
+        true_cfg_scale: float = 4.0,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        sigmas: Optional[List[float]] = None,
+        guidance_scale: float = 1.0,
+        control_guidance_start: Union[float, List[float]] = 0.0,
+        control_guidance_end: Union[float, List[float]] = 1.0,
+        control_image: PipelineImageInput = None,
+        control_mask: PipelineImageInput = None,
+        controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
+        num_images_per_prompt: int = 1,
+        generator: Optional[Union[np.random.Generator, List[np.random.Generator]]] = None,
+        latents: Optional[ms.tensor] = None,
+        prompt_embeds: Optional[ms.tensor] = None,
+        prompt_embeds_mask: Optional[ms.tensor] = None,
+        negative_prompt_embeds: Optional[ms.tensor] = None,
+        negative_prompt_embeds_mask: Optional[ms.tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        attention_kwargs: Optional[Dict[str, Any]] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        max_sequence_length: int = 512,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
+                instead.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `true_cfg_scale` is
+                not greater than `1`).
+            true_cfg_scale (`float`, *optional*, defaults to 4.0):
+                When > 1.0 and a `negative_prompt` is provided, true classifier-free guidance is enabled.
+            height (`int`, *optional*, defaults to `self.default_sample_size * self.vae_scale_factor`):
+                The height in pixels of the generated image. This is set to 1024 by default for the best results.
+            width (`int`, *optional*, defaults to `self.default_sample_size * self.vae_scale_factor`):
+                The width in pixels of the generated image. This is set to 1024 by default for the best results.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            sigmas (`List[float]`, *optional*):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            guidance_scale (`float`, *optional*, defaults to 1.0):
+                Guidance scale as defined in [Classifier-Free Diffusion
+                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2
+                of [Imagen Paper](https://huggingface.co/papers/2205.11487). It is only passed to the transformer when
+                `transformer.config.guidance_embeds` is enabled. A higher guidance scale encourages the model to
+                generate images that are closely linked to the text `prompt`, usually at the expense of lower image
+                quality.
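+            control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
+                The fraction of the total denoising steps at which the ControlNet starts being applied.
+            control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The fraction of the total denoising steps at which the ControlNet stops being applied.
+            control_image (`PipelineImageInput`, *optional*):
+                The image to be inpainted. Together with `control_mask` it is encoded into latent space and passed to
+                the ControlNet as conditioning.
+            control_mask (`PipelineImageInput`, *optional*):
+                The inpainting mask. White (non-zero) pixels mark the region to be repainted, black pixels are kept.
+            controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
+                to the residual in the original `transformer`.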
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`np.random.Generator` or `List[np.random.Generator]`, *optional*):
+                One or a list of [numpy generator(s)](https://numpy.org/doc/stable/reference/random/generator.html)
+                to make generation deterministic.
+            latents (`ms.tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will be generated by sampling using the supplied random `generator`.
+            prompt_embeds (`ms.tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`ms.tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.qwenimage.QwenImagePipelineOutput`] instead of a plain tuple.
+            attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            callback_on_step_end (`Callable`, *optional*):
+                A function that is called at the end of each denoising step during inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            max_sequence_length (`int`, *optional*, defaults to 512):
+                Maximum sequence length to use with the `prompt`.
+
+        Returns:
+            [`~pipelines.qwenimage.QwenImagePipelineOutput`] or `tuple`:
+            [`~pipelines.qwenimage.QwenImagePipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When
+            returning a tuple, the first element is a list with the generated images.
+
+        Examples:
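+            A minimal usage sketch (illustrative only, not taken from the original code; the ControlNet checkpoint id,
+            the `load_image` helper, and the import path of this pipeline are assumptions):
+
+            ```py
+            >>> import mindspore as ms
+            >>> from mindone.diffusers import QwenImageControlNetModel
+            >>> from mindone.diffusers.pipelines.qwenimage import QwenImageControlNetInpaintPipeline
+            >>> from mindone.diffusers.utils import load_image
+
+            >>> # Hypothetical checkpoint id; substitute a real Qwen-Image inpainting ControlNet.
+            >>> controlnet = QwenImageControlNetModel.from_pretrained(
+            ...     "InstantX/Qwen-Image-ControlNet-Inpainting", mindspore_dtype=ms.bfloat16
+            ... )
+            >>> pipe = QwenImageControlNetInpaintPipeline.from_pretrained(
+            ...     "Qwen/Qwen-Image", controlnet=controlnet, mindspore_dtype=ms.bfloat16
+            ... )
+            >>> image = load_image("inpaint_image.png")  # image to edit
+            >>> mask = load_image("inpaint_mask.png")  # white = region to repaint
+            >>> result = pipe(
+            ...     prompt="a red sports car parked on the street",
+            ...     negative_prompt=" ",
+            ...     control_image=image,
+            ...     control_mask=mask,
+            ...     controlnet_conditioning_scale=1.0,
+            ...     num_inference_steps=30,
+            ...     true_cfg_scale=4.0,
+            ... ).images[0]
+            >>> result.save("qwenimage_controlnet_inpaint.png")
+            ```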
+ """ + + height = height or self.default_sample_size * self.vae_scale_factor + width = width or self.default_sample_size * self.vae_scale_factor + + if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list): + control_guidance_start = len(control_guidance_end) * [control_guidance_start] + elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list): + control_guidance_end = len(control_guidance_start) * [control_guidance_end] + elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list): + mult = len(control_image) if isinstance(self.controlnet, QwenImageMultiControlNetModel) else 1 + control_guidance_start, control_guidance_end = ( + mult * [control_guidance_start], + mult * [control_guidance_end], + ) + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + height, + width, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + prompt_embeds_mask=prompt_embeds_mask, + negative_prompt_embeds_mask=negative_prompt_embeds_mask, + callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs, + max_sequence_length=max_sequence_length, + ) + + self._guidance_scale = guidance_scale + self._attention_kwargs = attention_kwargs + self._current_timestep = None + self._interrupt = False + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + has_neg_prompt = negative_prompt is not None or ( + negative_prompt_embeds is not None and negative_prompt_embeds_mask is not None + ) + do_true_cfg = true_cfg_scale > 1 and has_neg_prompt + prompt_embeds, prompt_embeds_mask = self.encode_prompt( + prompt=prompt, + prompt_embeds=prompt_embeds, + prompt_embeds_mask=prompt_embeds_mask, + num_images_per_prompt=num_images_per_prompt, + max_sequence_length=max_sequence_length, + ) + if do_true_cfg: + negative_prompt_embeds, negative_prompt_embeds_mask = self.encode_prompt( + prompt=negative_prompt, + prompt_embeds=negative_prompt_embeds, + prompt_embeds_mask=negative_prompt_embeds_mask, + num_images_per_prompt=num_images_per_prompt, + max_sequence_length=max_sequence_length, + ) + + # 3. Prepare control image + num_channels_latents = self.transformer.config.in_channels // 4 + if isinstance(self.controlnet, QwenImageControlNetModel): + control_image = self.prepare_image_with_mask( + image=control_image, + mask=control_mask, + width=width, + height=height, + batch_size=batch_size * num_images_per_prompt, + num_images_per_prompt=num_images_per_prompt, + dtype=self.vae.dtype, + ) + + # 4. Prepare latent variables + num_channels_latents = self.transformer.config.in_channels // 4 + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + generator, + latents, + ) + img_shapes = [(1, height // self.vae_scale_factor // 2, width // self.vae_scale_factor // 2)] * batch_size + + # 5. 
Prepare timesteps + sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas + image_seq_len = latents.shape[1] + mu = calculate_shift( + image_seq_len, + self.scheduler.config.get("base_image_seq_len", 256), + self.scheduler.config.get("max_image_seq_len", 4096), + self.scheduler.config.get("base_shift", 0.5), + self.scheduler.config.get("max_shift", 1.15), + ) + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, + num_inference_steps, + sigmas=sigmas, + mu=mu, + ) + num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) + self._num_timesteps = len(timesteps) + + controlnet_keep = [] + for i in range(len(timesteps)): + keeps = [ + 1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e) + for s, e in zip(control_guidance_start, control_guidance_end) + ] + controlnet_keep.append(keeps[0] if isinstance(self.controlnet, QwenImageControlNetModel) else keeps) + + # handle guidance + if self.transformer.config.guidance_embeds: + guidance = mint.full([1], guidance_scale, dtype=ms.float32) + guidance = guidance.expand((latents.shape[0],)) + else: + guidance = None + + if self.attention_kwargs is None: + self._attention_kwargs = {} + + # 6. Denoising loop + self.scheduler.set_begin_index(0) + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + self._current_timestep = t + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timestep = t.expand((latents.shape[0],)).to(latents.dtype) + + if isinstance(controlnet_keep[i], list): + cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])] + else: + controlnet_cond_scale = controlnet_conditioning_scale + if isinstance(controlnet_cond_scale, list): + controlnet_cond_scale = controlnet_cond_scale[0] + cond_scale = controlnet_cond_scale * controlnet_keep[i] + + # controlnet + controlnet_block_samples = self.controlnet( + hidden_states=latents, + controlnet_cond=control_image.to(dtype=latents.dtype), + conditioning_scale=cond_scale, + timestep=timestep / 1000, + encoder_hidden_states=prompt_embeds, + encoder_hidden_states_mask=prompt_embeds_mask, + img_shapes=img_shapes, + txt_seq_lens=prompt_embeds_mask.sum(dim=1).tolist(), + return_dict=False, + ) + + noise_pred = self.transformer( + hidden_states=latents, + timestep=timestep / 1000, + encoder_hidden_states=prompt_embeds, + encoder_hidden_states_mask=prompt_embeds_mask, + img_shapes=img_shapes, + txt_seq_lens=prompt_embeds_mask.sum(dim=1).tolist(), + controlnet_block_samples=controlnet_block_samples, + attention_kwargs=self.attention_kwargs, + return_dict=False, + )[0] + + if do_true_cfg: + neg_noise_pred = self.transformer( + hidden_states=latents, + timestep=timestep / 1000, + guidance=guidance, + encoder_hidden_states_mask=negative_prompt_embeds_mask, + encoder_hidden_states=negative_prompt_embeds, + img_shapes=img_shapes, + txt_seq_lens=negative_prompt_embeds_mask.sum(dim=1).tolist(), + controlnet_block_samples=controlnet_block_samples, + attention_kwargs=self.attention_kwargs, + return_dict=False, + )[0] + comb_pred = neg_noise_pred + true_cfg_scale * (noise_pred - neg_noise_pred) + + cond_norm = mint.norm(noise_pred, dim=-1, keepdim=True) + noise_norm = mint.norm(comb_pred, dim=-1, keepdim=True) + noise_pred = comb_pred * (cond_norm / noise_norm) + + # compute the previous noisy sample x_t -> x_t-1 + latents_dtype = latents.dtype + latents = 
self.scheduler.step(noise_pred, t, latents, return_dict=False)[0] + + if latents.dtype != latents_dtype: + latents = latents.to(latents_dtype) + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + + self._current_timestep = None + if output_type == "latent": + image = latents + else: + latents = self._unpack_latents(latents, height, width, self.vae_scale_factor) + latents = latents.to(self.vae.dtype) + latents_mean = ( + ms.tensor(self.vae.config.latents_mean).view(1, self.vae.config.z_dim, 1, 1, 1).to(latents.dtype) + ) + latents_std = 1.0 / ms.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1).to( + latents.dtype + ) + latents = latents / latents_std + latents_mean + image = self.vae.decode(latents, return_dict=False)[0][:, :, 0] + image = self.image_processor.postprocess(image, output_type=output_type) + + if not return_dict: + return (image,) + + return QwenImagePipelineOutput(images=image) diff --git a/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_plus.py b/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_plus.py index cab0b1d483..00f87fe0e9 100644 --- a/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_plus.py +++ b/mindone/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_plus.py @@ -361,8 +361,8 @@ def check_inputs( if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`: {negative_prompt_embeds}. " + "Please make sure to only forward one of the two." ) if prompt_embeds is not None and prompt_embeds_mask is None: @@ -416,7 +416,7 @@ def _encode_vae_image(self, image: ms.tensor, generator: np.random.Generator): image_latents = mint.cat(image_latents, dim=0) else: image_latents = retrieve_latents(self.vae, self.vae.encode(image)[0], sample_mode="argmax") - + latents_mean = ( ms.tensor(self.vae.config.latents_mean).view(1, self.latent_channels, 1, 1, 1).to(image_latents.dtype) )