@@ -635,6 +635,8 @@ def apply_edit_conditioning(
635635 input_latent : Output ,
636636 control_layers : list [Control ],
637637 vae : Output ,
638+ clip : Output ,
639+ positive : str ,
638640 arch : Arch ,
639641 tiled_vae : bool ,
640642):
@@ -643,10 +645,16 @@ def apply_edit_conditioning(
643645
644646 extra_input = [c .image for c in control_layers if c .mode .is_ip_adapter ]
645647 if len (extra_input ) == 0 :
648+ if arch == Arch .qwen_e :
649+ # Don't use VAE to force the reference latent
650+ cond = w .text_encode_qwen_image_edit (clip , None , input_image , positive )
646651 return w .reference_latent (cond , input_latent )
647652
648653 input = w .image_stitch ([input_image ] + [i .load (w ) for i in extra_input ])
649654 latent = vae_encode (w , vae , input , tiled_vae )
655+ if arch == Arch .qwen_e :
656+ # Don't use VAE to force the reference latent
657+ cond = w .text_encode_qwen_image_edit (clip , None , input , positive )
650658 cond = w .reference_latent (cond , latent )
651659 return cond
652660
@@ -734,7 +742,7 @@ def scale_refine_and_decode(
734742 model , positive , negative = apply_control (
735743 w , model , positive , negative , cond .all_control , extent .desired , vae , models
736744 )
737- positive = apply_edit_conditioning (w , positive , upscale , latent , [], vae , arch , tiled_vae )
745+ positive = apply_edit_conditioning (w , positive , upscale , latent , [], vae , clip . model , cond . positive . text , arch , tiled_vae )
738746 result = w .sampler_custom_advanced (model , positive , negative , latent , arch , ** params )
739747 image = vae_decode (w , vae , result , tiled_vae )
740748 return image
@@ -1015,7 +1023,7 @@ def refine(
10151023 w , model , positive , negative , cond .all_control , extent .desired , vae , models
10161024 )
10171025 positive = apply_edit_conditioning (
1018- w , positive , in_image , latent , cond .all_control , vae , models .arch , checkpoint .tiled_vae
1026+ w , positive , in_image , latent , cond .all_control , vae , clip . model , cond . positive . text , models .arch , checkpoint .tiled_vae
10191027 )
10201028 sampler = w .sampler_custom_advanced (
10211029 model , positive , negative , latent_batch , models .arch , ** _sampler_params (sampling )
@@ -1067,7 +1075,7 @@ def refine_region(
10671075 else :
10681076 latent = vae_encode (w , vae , in_image , checkpoint .tiled_vae )
10691077 positive = apply_edit_conditioning (
1070- w , positive , in_image , latent , cond .all_control , vae , models .arch , checkpoint .tiled_vae
1078+ w , positive , in_image , latent , cond .all_control , vae , clip . model , cond . positive . text , models .arch , checkpoint .tiled_vae
10711079 )
10721080 latent = w .set_latent_noise_mask (latent , initial_mask )
10731081 inpaint_model = model
0 commit comments