 from invokeai.app.invocations.t2i_adapter import T2IAdapterField
 from invokeai.app.services.shared.invocation_context import InvocationContext
 from invokeai.app.util.controlnet_utils import prepare_control_image
-from invokeai.backend.ip_adapter.ip_adapter import IPAdapter, IPAdapterPlus
+from invokeai.backend.ip_adapter.ip_adapter import IPAdapter
 from invokeai.backend.lora import LoRAModelRaw
 from invokeai.backend.model_manager import BaseModelType, LoadedModel
 from invokeai.backend.model_manager.config import MainConfigBase, ModelVariantType
@@ -672,54 +672,52 @@ def prep_control_data(
 
         return controlnet_data
 
+    def prep_ip_adapter_image_prompts(
+        self,
+        context: InvocationContext,
+        ip_adapters: List[IPAdapterField],
+    ) -> List[Tuple[torch.Tensor, torch.Tensor]]:
+        """Run the IPAdapter CLIPVisionModel, returning image prompt embeddings."""
+        image_prompts = []
+        for single_ip_adapter in ip_adapters:
+            with context.models.load(single_ip_adapter.ip_adapter_model) as ip_adapter_model:
+                assert isinstance(ip_adapter_model, IPAdapter)
+                image_encoder_model_info = context.models.load(single_ip_adapter.image_encoder_model)
+                # `single_ip_adapter.image` could be a list or a single ImageField. Normalize to a list here.
+                single_ipa_image_fields = single_ip_adapter.image
+                if not isinstance(single_ipa_image_fields, list):
+                    single_ipa_image_fields = [single_ipa_image_fields]
+
+                single_ipa_images = [context.images.get_pil(image.image_name) for image in single_ipa_image_fields]
+                with image_encoder_model_info as image_encoder_model:
+                    assert isinstance(image_encoder_model, CLIPVisionModelWithProjection)
+                    # Get image embeddings from CLIP and ImageProjModel.
+                    image_prompt_embeds, uncond_image_prompt_embeds = ip_adapter_model.get_image_embeds(
+                        single_ipa_images, image_encoder_model
+                    )
+                    image_prompts.append((image_prompt_embeds, uncond_image_prompt_embeds))
+
+        return image_prompts
+
     def prep_ip_adapter_data(
         self,
         context: InvocationContext,
-        ip_adapter: Optional[Union[IPAdapterField, list[IPAdapterField]]],
+        ip_adapters: List[IPAdapterField],
+        image_prompts: List[Tuple[torch.Tensor, torch.Tensor]],
         exit_stack: ExitStack,
         latent_height: int,
         latent_width: int,
         dtype: torch.dtype,
-    ) -> Optional[list[IPAdapterData]]:
-        """If IP-Adapter is enabled, then this function loads the requisite models, and adds the image prompt embeddings
-        to the `conditioning_data` (in-place).
-        """
-        if ip_adapter is None:
-            return None
-
-        # ip_adapter could be a list or a single IPAdapterField. Normalize to a list here.
-        if not isinstance(ip_adapter, list):
-            ip_adapter = [ip_adapter]
-
-        if len(ip_adapter) == 0:
-            return None
-
+    ) -> Optional[List[IPAdapterData]]:
+        """If IP-Adapter is enabled, then this function loads the requisite models and adds the image prompt conditioning data."""
         ip_adapter_data_list = []
-        for single_ip_adapter in ip_adapter:
-            ip_adapter_model: Union[IPAdapter, IPAdapterPlus] = exit_stack.enter_context(
-                context.models.load(single_ip_adapter.ip_adapter_model)
-            )
-
-            image_encoder_model_info = context.models.load(single_ip_adapter.image_encoder_model)
-            # `single_ip_adapter.image` could be a list or a single ImageField. Normalize to a list here.
-            single_ipa_image_fields = single_ip_adapter.image
-            if not isinstance(single_ipa_image_fields, list):
-                single_ipa_image_fields = [single_ipa_image_fields]
-
-            single_ipa_images = [context.images.get_pil(image.image_name) for image in single_ipa_image_fields]
-
-            # TODO(ryand): With some effort, the step of running the CLIP Vision encoder could be done before any other
-            # models are needed in memory. This would help to reduce peak memory utilization in low-memory environments.
-            with image_encoder_model_info as image_encoder_model:
-                assert isinstance(image_encoder_model, CLIPVisionModelWithProjection)
-                # Get image embeddings from CLIP and ImageProjModel.
-                image_prompt_embeds, uncond_image_prompt_embeds = ip_adapter_model.get_image_embeds(
-                    single_ipa_images, image_encoder_model
-                )
+        for single_ip_adapter, (image_prompt_embeds, uncond_image_prompt_embeds) in zip(
+            ip_adapters, image_prompts, strict=True
+        ):
+            ip_adapter_model = exit_stack.enter_context(context.models.load(single_ip_adapter.ip_adapter_model))
 
-            mask = single_ip_adapter.mask
-            if mask is not None:
-                mask = context.tensors.load(mask.tensor_name)
+            mask_field = single_ip_adapter.mask
+            mask = context.tensors.load(mask_field.tensor_name) if mask_field is not None else None
             mask = self._preprocess_regional_prompt_mask(mask, latent_height, latent_width, dtype=dtype)
 
             ip_adapter_data_list.append(
@@ -734,7 +732,7 @@ def prep_ip_adapter_data(
                 )
             )
 
-        return ip_adapter_data_list
+        return ip_adapter_data_list if len(ip_adapter_data_list) > 0 else None
 
     def run_t2i_adapters(
         self,
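Not from this commit: a small standalone illustration (plain Python 3.10+) of the `zip(..., strict=True)` guarantee the rewritten loop now relies on. If the list of adapters and the list of precomputed image prompts ever get out of sync, the mismatch raises a `ValueError` instead of silently dropping entries.

adapters = ["adapter_a", "adapter_b"]   # stand-ins for IPAdapterField objects
prompts = [("embeds_a", "uncond_a")]    # one precomputed prompt missing

try:
    for adapter, (embeds, uncond) in zip(adapters, prompts, strict=True):
        print(adapter, embeds, uncond)
except ValueError as err:
    print(err)  # "zip() argument 2 is shorter than argument 1"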
@@ -855,6 +853,16 @@ def init_scheduler(
         # At some point, someone decided that schedulers that accept a generator should use the original seed with
         # all bits flipped. I don't know the original rationale for this, but now we must keep it like this for
         # reproducibility.
+        #
+        # These Invoke-supported schedulers accept a generator as of 2024-06-04:
+        # - DDIMScheduler
+        # - DDPMScheduler
+        # - DPMSolverMultistepScheduler
+        # - EulerAncestralDiscreteScheduler
+        # - EulerDiscreteScheduler
+        # - KDPM2AncestralDiscreteScheduler
+        # - LCMScheduler
+        # - TCDScheduler
         scheduler_step_kwargs.update({"generator": torch.Generator(device=device).manual_seed(seed ^ 0xFFFFFFFF)})
         if isinstance(scheduler, TCDScheduler):
             scheduler_step_kwargs.update({"eta": 1.0})
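For reference, a standalone snippet (not from this commit) showing the effect of the seed handling described in the comment above: `seed ^ 0xFFFFFFFF` flips the low 32 bits of the user-facing seed, and two generators seeded with the same flipped value produce identical noise.

import torch

seed = 123
flipped = seed ^ 0xFFFFFFFF  # flips the low 32 bits; 123 -> 4294967172
assert flipped == 4294967172

gen_a = torch.Generator(device="cpu").manual_seed(flipped)
gen_b = torch.Generator(device="cpu").manual_seed(flipped)
assert torch.equal(torch.randn(4, generator=gen_a), torch.randn(4, generator=gen_b))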
@@ -912,6 +920,20 @@ def invoke(self, context: InvocationContext) -> LatentsOutput:
             do_classifier_free_guidance=True,
         )
 
+        ip_adapters: List[IPAdapterField] = []
+        if self.ip_adapter is not None:
+            # ip_adapter could be a list or a single IPAdapterField. Normalize to a list here.
+            if isinstance(self.ip_adapter, list):
+                ip_adapters = self.ip_adapter
+            else:
+                ip_adapters = [self.ip_adapter]
+
+        # If there are IP adapters, the following line runs the adapters' CLIPVision image encoders to return
+        # a series of image conditioning embeddings. This is being done here rather than in the
+        # big model context below in order to use less VRAM on low-VRAM systems.
+        # The image prompts are then passed to prep_ip_adapter_data().
+        image_prompts = self.prep_ip_adapter_image_prompts(context=context, ip_adapters=ip_adapters)
+
         # get the unet's config so that we can pass the base to dispatch_progress()
         unet_config = context.models.get_config(self.unet.unet.key)
 
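The comment block added here states the motivation: the CLIP Vision encoders run to completion (and are released) before the UNet and other heavy models are brought up, so the two never need to be resident at once. A toy sketch of that staging idea, using small stand-in modules rather than the real models:

import torch
from torch import nn

def encode_image_prompts(images: torch.Tensor) -> torch.Tensor:
    encoder = nn.Linear(768, 1024)  # stand-in for a CLIP Vision encoder
    with torch.no_grad():
        embeds = encoder(images)
    del encoder                     # encoder memory is released before the big model loads
    return embeds

def run_denoise(embeds: torch.Tensor) -> torch.Tensor:
    unet = nn.Linear(1024, 1024)    # stand-in for the UNet; only now does peak memory occur
    with torch.no_grad():
        return unet(embeds)

image_prompts = encode_image_prompts(torch.randn(2, 768))
result = run_denoise(image_prompts)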
@@ -930,11 +952,15 @@ def _lora_loader() -> Iterator[Tuple[LoRAModelRaw, float]]:
         assert isinstance(unet_info.model, UNet2DConditionModel)
         with (
             ExitStack() as exit_stack,
-            unet_info as unet,
+            unet_info.model_on_device() as (model_state_dict, unet),
             ModelPatcher.apply_freeu(unet, self.unet.freeu_config),
             set_seamless(unet, self.unet.seamless_axes),  # FIXME
             # Apply the LoRA after unet has been moved to its target device for faster patching.
-            ModelPatcher.apply_lora_unet(unet, _lora_loader()),
+            ModelPatcher.apply_lora_unet(
+                unet,
+                loras=_lora_loader(),
+                model_state_dict=model_state_dict,
+            ),
         ):
             assert isinstance(unet, UNet2DConditionModel)
             latents = latents.to(device=unet.device, dtype=unet.dtype)
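In this hunk the UNet is now loaded via `unet_info.model_on_device()`, which also yields a state dict, and that state dict is passed to `ModelPatcher.apply_lora_unet()` as `model_state_dict`, presumably so the patcher can restore original weights from an already-available copy rather than cloning them off the device. A minimal, generic sketch of that patch-and-restore pattern (not InvokeAI's implementation; names are illustrative):

from contextlib import contextmanager
from typing import Dict, Iterator

import torch
from torch import nn

@contextmanager
def patch_weight(
    model: nn.Module, cpu_state_dict: Dict[str, torch.Tensor], key: str, delta: torch.Tensor
) -> Iterator[nn.Module]:
    """Add `delta` to one parameter, then restore the original from a CPU state dict on exit."""
    param = model.get_parameter(key)
    try:
        with torch.no_grad():
            param.add_(delta.to(device=param.device, dtype=param.dtype))  # patch in place
        yield model
    finally:
        with torch.no_grad():
            param.copy_(cpu_state_dict[key].to(device=param.device, dtype=param.dtype))  # restore

model = nn.Linear(4, 4)
originals = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
with patch_weight(model, originals, "weight", torch.full((4, 4), 0.01)):
    pass  # run inference with the patched weights here
assert torch.equal(model.weight.detach().cpu(), originals["weight"])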
@@ -970,7 +996,8 @@ def _lora_loader() -> Iterator[Tuple[LoRAModelRaw, float]]:
 
             ip_adapter_data = self.prep_ip_adapter_data(
                 context=context,
-                ip_adapter=self.ip_adapter,
+                ip_adapters=ip_adapters,
+                image_prompts=image_prompts,
                 exit_stack=exit_stack,
                 latent_height=latent_height,
                 latent_width=latent_width,
@@ -1285,7 +1312,7 @@ def _(vae: AutoencoderTiny, image_tensor: torch.FloatTensor) -> torch.FloatTenso
     title="Blend Latents",
     tags=["latents", "blend"],
     category="latents",
-    version="1.0.2",
+    version="1.0.3",
 )
 class BlendLatentsInvocation(BaseInvocation):
     """Blend two latents using a given alpha. Latents must have same size."""
@@ -1364,7 +1391,7 @@ def slerp(
         TorchDevice.empty_cache()
 
         name = context.tensors.save(tensor=blended_latents)
-        return LatentsOutput.build(latents_name=name, latents=blended_latents)
+        return LatentsOutput.build(latents_name=name, latents=blended_latents, seed=self.latents_a.seed)
 
 
 # The Crop Latents node was copied from @skunkworxdark's implementation here: