
Commit 4f8a4b0

Merge branch 'main' into depth_anything_v2

2 parents: a743f3c + 571ba87

29 files changed: +959 additions, -186 deletions


invokeai/app/invocations/compel.py

Lines changed: 4 additions & 4 deletions

@@ -80,12 +80,12 @@ def _lora_loader() -> Iterator[Tuple[LoRAModelRaw, float]]:
 
         with (
             # apply all patches while the model is on the target device
-            text_encoder_info.model_on_device() as (model_state_dict, text_encoder),
+            text_encoder_info.model_on_device() as (cached_weights, text_encoder),
             tokenizer_info as tokenizer,
             ModelPatcher.apply_lora_text_encoder(
                 text_encoder,
                 loras=_lora_loader(),
-                model_state_dict=model_state_dict,
+                cached_weights=cached_weights,
             ),
             # Apply CLIP Skip after LoRA to prevent LoRA application from failing on skipped layers.
             ModelPatcher.apply_clip_skip(text_encoder, self.clip.skipped_layers),
@@ -175,13 +175,13 @@ def _lora_loader() -> Iterator[Tuple[LoRAModelRaw, float]]:
 
         with (
             # apply all patches while the model is on the target device
-            text_encoder_info.model_on_device() as (state_dict, text_encoder),
+            text_encoder_info.model_on_device() as (cached_weights, text_encoder),
             tokenizer_info as tokenizer,
             ModelPatcher.apply_lora(
                 text_encoder,
                 loras=_lora_loader(),
                 prefix=lora_prefix,
-                model_state_dict=state_dict,
+                cached_weights=cached_weights,
             ),
             # Apply CLIP Skip after LoRA to prevent LoRA application from failing on skipped layers.
             ModelPatcher.apply_clip_skip(text_encoder, clip_field.skipped_layers),
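
Note on the rename above: model_on_device() yields a weights mapping alongside the on-device model, and this diff renames that mapping from model_state_dict to cached_weights to match the new ModelPatcher keyword. Below is a minimal, hypothetical sketch of a context manager with that shape (not InvokeAI's implementation), assuming the yielded dict is a CPU-side snapshot used to restore the model after LoRA patching.

from contextlib import contextmanager
from typing import Dict, Iterator, Tuple

import torch


@contextmanager
def model_on_device(model: torch.nn.Module, device: torch.device) -> Iterator[Tuple[Dict[str, torch.Tensor], torch.nn.Module]]:
    # Hypothetical: snapshot the weights before the model is moved and patched.
    cached_weights = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
    model.to(device)
    try:
        yield cached_weights, model
    finally:
        # Hypothetical: restore the unpatched weights when the block exits.
        model.load_state_dict(cached_weights)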

invokeai/app/invocations/create_gradient_mask.py

Lines changed: 2 additions & 1 deletion

@@ -39,7 +39,7 @@ class GradientMaskOutput(BaseInvocationOutput):
     title="Create Gradient Mask",
     tags=["mask", "denoise"],
     category="latents",
-    version="1.1.0",
+    version="1.2.0",
 )
 class CreateGradientMaskInvocation(BaseInvocation):
     """Creates mask for denoising model run."""
@@ -93,6 +93,7 @@ def invoke(self, context: InvocationContext) -> GradientMaskOutput:
 
         # redistribute blur so that the original edges are 0 and blur outwards to 1
         blur_tensor = (blur_tensor - 0.5) * 2
+        blur_tensor[blur_tensor < 0] = 0.0
 
         threshold = 1 - self.minimum_denoise
 
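
The one functional change here is the added clamp: (blur_tensor - 0.5) * 2 maps values below 0.5 to negative numbers, and the new line zeroes them so the redistributed mask stays in [0, 1]. A small standalone illustration (values chosen arbitrarily):

import torch

blur_tensor = torch.tensor([0.0, 0.25, 0.5, 0.75, 1.0])
blur_tensor = (blur_tensor - 0.5) * 2      # -> [-1.0, -0.5, 0.0, 0.5, 1.0]
blur_tensor[blur_tensor < 0] = 0.0         # -> [ 0.0,  0.0, 0.0, 0.5, 1.0]
print(blur_tensor)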

invokeai/app/invocations/denoise_latents.py

Lines changed: 13 additions & 2 deletions

@@ -62,6 +62,7 @@
 from invokeai.backend.stable_diffusion.extensions.freeu import FreeUExt
 from invokeai.backend.stable_diffusion.extensions.inpaint import InpaintExt
 from invokeai.backend.stable_diffusion.extensions.inpaint_model import InpaintModelExt
+from invokeai.backend.stable_diffusion.extensions.lora import LoRAExt
 from invokeai.backend.stable_diffusion.extensions.preview import PreviewExt
 from invokeai.backend.stable_diffusion.extensions.rescale_cfg import RescaleCFGExt
 from invokeai.backend.stable_diffusion.extensions.seamless import SeamlessExt
@@ -845,6 +846,16 @@ def step_callback(state: PipelineIntermediateState) -> None:
         if self.unet.freeu_config:
             ext_manager.add_extension(FreeUExt(self.unet.freeu_config))
 
+        ### lora
+        if self.unet.loras:
+            for lora_field in self.unet.loras:
+                ext_manager.add_extension(
+                    LoRAExt(
+                        node_context=context,
+                        model_id=lora_field.lora,
+                        weight=lora_field.weight,
+                    )
+                )
         ### seamless
         if self.unet.seamless_axes:
             ext_manager.add_extension(SeamlessExt(self.unet.seamless_axes))
@@ -964,14 +975,14 @@ def _lora_loader() -> Iterator[Tuple[LoRAModelRaw, float]]:
         assert isinstance(unet_info.model, UNet2DConditionModel)
         with (
             ExitStack() as exit_stack,
-            unet_info.model_on_device() as (model_state_dict, unet),
+            unet_info.model_on_device() as (cached_weights, unet),
             ModelPatcher.apply_freeu(unet, self.unet.freeu_config),
             SeamlessExt.static_patch_model(unet, self.unet.seamless_axes),  # FIXME
             # Apply the LoRA after unet has been moved to its target device for faster patching.
             ModelPatcher.apply_lora_unet(
                 unet,
                 loras=_lora_loader(),
-                model_state_dict=model_state_dict,
+                cached_weights=cached_weights,
             ),
         ):
             assert isinstance(unet, UNet2DConditionModel)
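
The second hunk registers one LoRAExt per entry in self.unet.loras with the extension manager. A minimal, hypothetical sketch of that registration pattern, using stand-in classes rather than InvokeAI's real extension manager or LoRAExt:

from dataclasses import dataclass, field
from typing import Any, List


@dataclass
class LoRAExtSketch:  # stand-in for invokeai...extensions.lora.LoRAExt
    node_context: Any
    model_id: str
    weight: float


@dataclass
class ExtManagerSketch:  # stand-in for the extension manager used above
    extensions: List[Any] = field(default_factory=list)

    def add_extension(self, ext: Any) -> None:
        self.extensions.append(ext)


ext_manager = ExtManagerSketch()
loras = [("lora-detail", 0.75), ("lora-style", 1.0)]  # stand-in for self.unet.loras
for model_id, weight in loras:
    ext_manager.add_extension(LoRAExtSketch(node_context=None, model_id=model_id, weight=weight))
assert len(ext_manager.extensions) == 2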

invokeai/app/invocations/fields.py

Lines changed: 26 additions & 1 deletion

@@ -1,7 +1,7 @@
 from enum import Enum
 from typing import Any, Callable, Optional, Tuple
 
-from pydantic import BaseModel, ConfigDict, Field, RootModel, TypeAdapter
+from pydantic import BaseModel, ConfigDict, Field, RootModel, TypeAdapter, model_validator
 from pydantic.fields import _Unset
 from pydantic_core import PydanticUndefined
 
@@ -242,6 +242,31 @@ class ConditioningField(BaseModel):
     )
 
 
+class BoundingBoxField(BaseModel):
+    """A bounding box primitive value."""
+
+    x_min: int = Field(ge=0, description="The minimum x-coordinate of the bounding box (inclusive).")
+    x_max: int = Field(ge=0, description="The maximum x-coordinate of the bounding box (exclusive).")
+    y_min: int = Field(ge=0, description="The minimum y-coordinate of the bounding box (inclusive).")
+    y_max: int = Field(ge=0, description="The maximum y-coordinate of the bounding box (exclusive).")
+
+    score: Optional[float] = Field(
+        default=None,
+        ge=0.0,
+        le=1.0,
+        description="The score associated with the bounding box. In the range [0, 1]. This value is typically set "
+        "when the bounding box was produced by a detector and has an associated confidence score.",
+    )
+
+    @model_validator(mode="after")
+    def check_coords(self):
+        if self.x_min > self.x_max:
+            raise ValueError(f"x_min ({self.x_min}) is greater than x_max ({self.x_max}).")
+        if self.y_min > self.y_max:
+            raise ValueError(f"y_min ({self.y_min}) is greater than y_max ({self.y_max}).")
+        return self
+
+
 class MetadataField(RootModel[dict[str, Any]]):
     """
     Pydantic model for metadata with custom root of type dict[str, Any].
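
Usage sketch for the new BoundingBoxField and its check_coords validator (assumes an environment where invokeai.app.invocations.fields is importable; the values are arbitrary):

from pydantic import ValidationError

from invokeai.app.invocations.fields import BoundingBoxField

# A well-formed box; score is optional and is typically set by a detector.
box = BoundingBoxField(x_min=10, y_min=20, x_max=110, y_max=220, score=0.87)
print(box.x_max - box.x_min, box.y_max - box.y_min)  # 100 200

# Inverted coordinates are rejected by the model_validator.
try:
    BoundingBoxField(x_min=110, y_min=20, x_max=10, y_max=220)
except ValidationError as err:
    print(err)  # x_min (110) is greater than x_max (10).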

Lines changed: 100 additions & 0 deletions (new file)

@@ -0,0 +1,100 @@
+from pathlib import Path
+from typing import Literal
+
+import torch
+from PIL import Image
+from transformers import pipeline
+from transformers.pipelines import ZeroShotObjectDetectionPipeline
+
+from invokeai.app.invocations.baseinvocation import BaseInvocation, invocation
+from invokeai.app.invocations.fields import BoundingBoxField, ImageField, InputField
+from invokeai.app.invocations.primitives import BoundingBoxCollectionOutput
+from invokeai.app.services.shared.invocation_context import InvocationContext
+from invokeai.backend.image_util.grounding_dino.detection_result import DetectionResult
+from invokeai.backend.image_util.grounding_dino.grounding_dino_pipeline import GroundingDinoPipeline
+
+GroundingDinoModelKey = Literal["grounding-dino-tiny", "grounding-dino-base"]
+GROUNDING_DINO_MODEL_IDS: dict[GroundingDinoModelKey, str] = {
+    "grounding-dino-tiny": "IDEA-Research/grounding-dino-tiny",
+    "grounding-dino-base": "IDEA-Research/grounding-dino-base",
+}
+
+
+@invocation(
+    "grounding_dino",
+    title="Grounding DINO (Text Prompt Object Detection)",
+    tags=["prompt", "object detection"],
+    category="image",
+    version="1.0.0",
+)
+class GroundingDinoInvocation(BaseInvocation):
+    """Runs a Grounding DINO model. Performs zero-shot bounding-box object detection from a text prompt."""
+
+    # Reference:
+    # - https://arxiv.org/pdf/2303.05499
+    # - https://huggingface.co/docs/transformers/v4.43.3/en/model_doc/grounding-dino#grounded-sam
+    # - https://github.com/NielsRogge/Transformers-Tutorials/blob/a39f33ac1557b02ebfb191ea7753e332b5ca933f/Grounding%20DINO/GroundingDINO_with_Segment_Anything.ipynb
+
+    model: GroundingDinoModelKey = InputField(description="The Grounding DINO model to use.")
+    prompt: str = InputField(description="The prompt describing the object to segment.")
+    image: ImageField = InputField(description="The image to segment.")
+    detection_threshold: float = InputField(
+        description="The detection threshold for the Grounding DINO model. All detected bounding boxes with scores above this threshold will be returned.",
+        ge=0.0,
+        le=1.0,
+        default=0.3,
+    )
+
+    @torch.no_grad()
+    def invoke(self, context: InvocationContext) -> BoundingBoxCollectionOutput:
+        # The model expects a 3-channel RGB image.
+        image_pil = context.images.get_pil(self.image.image_name, mode="RGB")
+
+        detections = self._detect(
+            context=context, image=image_pil, labels=[self.prompt], threshold=self.detection_threshold
+        )
+
+        # Convert detections to BoundingBoxCollectionOutput.
+        bounding_boxes: list[BoundingBoxField] = []
+        for detection in detections:
+            bounding_boxes.append(
+                BoundingBoxField(
+                    x_min=detection.box.xmin,
+                    x_max=detection.box.xmax,
+                    y_min=detection.box.ymin,
+                    y_max=detection.box.ymax,
+                    score=detection.score,
+                )
+            )
+        return BoundingBoxCollectionOutput(collection=bounding_boxes)
+
+    @staticmethod
+    def _load_grounding_dino(model_path: Path):
+        grounding_dino_pipeline = pipeline(
+            model=str(model_path),
+            task="zero-shot-object-detection",
+            local_files_only=True,
+            # TODO(ryand): Setting the torch_dtype here doesn't work. Investigate whether fp16 is supported by the
+            # model, and figure out how to make it work in the pipeline.
+            # torch_dtype=TorchDevice.choose_torch_dtype(),
+        )
+        assert isinstance(grounding_dino_pipeline, ZeroShotObjectDetectionPipeline)
+        return GroundingDinoPipeline(grounding_dino_pipeline)
+
+    def _detect(
+        self,
+        context: InvocationContext,
+        image: Image.Image,
+        labels: list[str],
+        threshold: float = 0.3,
+    ) -> list[DetectionResult]:
+        """Use Grounding DINO to detect bounding boxes for a set of labels in an image."""
+        # TODO(ryand): I copied this "."-handling logic from the transformers example code. Test it and see if it
+        # actually makes a difference.
+        labels = [label if label.endswith(".") else label + "." for label in labels]
+
+        with context.models.load_remote_model(
+            source=GROUNDING_DINO_MODEL_IDS[self.model], loader=GroundingDinoInvocation._load_grounding_dino
+        ) as detector:
+            assert isinstance(detector, GroundingDinoPipeline)
+            return detector.detect(image=image, candidate_labels=labels, threshold=threshold)
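
For reference, roughly the same zero-shot detection can be reproduced outside InvokeAI with the Hugging Face pipeline directly. This sketch assumes the documented output format of the transformers zero-shot-object-detection pipeline (a list of dicts with "score", "label", and a "box" of xmin/ymin/xmax/ymax); the image path is hypothetical.

from PIL import Image
from transformers import pipeline

detector = pipeline(task="zero-shot-object-detection", model="IDEA-Research/grounding-dino-tiny")

image = Image.open("example.jpg").convert("RGB")  # hypothetical input image
# Grounding DINO prompts are conventionally "."-terminated, mirroring _detect() above.
results = detector(image, candidate_labels=["a cat."], threshold=0.3)

for r in results:
    box = r["box"]
    print(f'{r["label"]}: score={r["score"]:.2f} box=({box["xmin"]}, {box["ymin"]}, {box["xmax"]}, {box["ymax"]})')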

invokeai/app/invocations/mask.py

Lines changed: 27 additions & 2 deletions

@@ -1,9 +1,10 @@
 import numpy as np
 import torch
+from PIL import Image
 
 from invokeai.app.invocations.baseinvocation import BaseInvocation, Classification, InvocationContext, invocation
-from invokeai.app.invocations.fields import ImageField, InputField, TensorField, WithMetadata
-from invokeai.app.invocations.primitives import MaskOutput
+from invokeai.app.invocations.fields import ImageField, InputField, TensorField, WithBoard, WithMetadata
+from invokeai.app.invocations.primitives import ImageOutput, MaskOutput
 
 
 @invocation(
@@ -118,3 +119,27 @@ def invoke(self, context: InvocationContext) -> MaskOutput:
             height=mask.shape[1],
             width=mask.shape[2],
         )
+
+
+@invocation(
+    "tensor_mask_to_image",
+    title="Tensor Mask to Image",
+    tags=["mask"],
+    category="mask",
+    version="1.0.0",
+)
+class MaskTensorToImageInvocation(BaseInvocation, WithMetadata, WithBoard):
+    """Convert a mask tensor to an image."""
+
+    mask: TensorField = InputField(description="The mask tensor to convert.")
+
+    def invoke(self, context: InvocationContext) -> ImageOutput:
+        mask = context.tensors.load(self.mask.tensor_name)
+        # Ensure that the mask is binary.
+        if mask.dtype != torch.bool:
+            mask = mask > 0.5
+        mask_np = (mask.float() * 255).byte().cpu().numpy()
+
+        mask_pil = Image.fromarray(mask_np, mode="L")
+        image_dto = context.images.save(image=mask_pil)
+        return ImageOutput.build(image_dto)
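
The new MaskTensorToImageInvocation boils down to a tensor-to-PIL conversion. A standalone illustration using only torch and PIL (no InvocationContext; a 2-D H×W mask is assumed):

import torch
from PIL import Image

mask = torch.rand(512, 512)                    # e.g. a soft mask in [0, 1]
if mask.dtype != torch.bool:
    mask = mask > 0.5                          # binarize, as in the invocation
mask_np = (mask.float() * 255).byte().cpu().numpy()
mask_pil = Image.fromarray(mask_np, mode="L")  # single-channel 8-bit image
mask_pil.save("mask.png")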

invokeai/app/invocations/primitives.py

Lines changed: 40 additions & 0 deletions

@@ -7,6 +7,7 @@
 from invokeai.app.invocations.baseinvocation import BaseInvocation, BaseInvocationOutput, invocation, invocation_output
 from invokeai.app.invocations.constants import LATENT_SCALE_FACTOR
 from invokeai.app.invocations.fields import (
+    BoundingBoxField,
     ColorField,
     ConditioningField,
     DenoiseMaskField,
@@ -469,3 +470,42 @@ def invoke(self, context: InvocationContext) -> ConditioningCollectionOutput:
 
 
 # endregion
+
+# region BoundingBox
+
+
+@invocation_output("bounding_box_output")
+class BoundingBoxOutput(BaseInvocationOutput):
+    """Base class for nodes that output a single bounding box"""
+
+    bounding_box: BoundingBoxField = OutputField(description="The output bounding box.")
+
+
+@invocation_output("bounding_box_collection_output")
+class BoundingBoxCollectionOutput(BaseInvocationOutput):
+    """Base class for nodes that output a collection of bounding boxes"""
+
+    collection: list[BoundingBoxField] = OutputField(description="The output bounding boxes.", title="Bounding Boxes")
+
+
+@invocation(
+    "bounding_box",
+    title="Bounding Box",
+    tags=["primitives", "segmentation", "collection", "bounding box"],
+    category="primitives",
+    version="1.0.0",
+)
+class BoundingBoxInvocation(BaseInvocation):
+    """Create a bounding box manually by supplying box coordinates"""
+
+    x_min: int = InputField(default=0, description="x-coordinate of the bounding box's top left vertex")
+    y_min: int = InputField(default=0, description="y-coordinate of the bounding box's top left vertex")
+    x_max: int = InputField(default=0, description="x-coordinate of the bounding box's bottom right vertex")
+    y_max: int = InputField(default=0, description="y-coordinate of the bounding box's bottom right vertex")
+
+    def invoke(self, context: InvocationContext) -> BoundingBoxOutput:
+        bounding_box = BoundingBoxField(x_min=self.x_min, y_min=self.y_min, x_max=self.x_max, y_max=self.y_max)
+        return BoundingBoxOutput(bounding_box=bounding_box)
+
+
+# endregion
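
Downstream consumers can treat BoundingBoxField coordinates as pixel bounds. An illustrative snippet (not part of this commit) that crops a PIL image to a box; x_max/y_max are exclusive, which matches PIL's crop convention:

from PIL import Image

from invokeai.app.invocations.fields import BoundingBoxField

image = Image.open("example.jpg")  # hypothetical input
box = BoundingBoxField(x_min=10, y_min=20, x_max=110, y_max=220)

# PIL's crop takes (left, upper, right, lower) with exclusive right/lower bounds.
cropped = image.crop((box.x_min, box.y_min, box.x_max, box.y_max))
cropped.save("cropped.png")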
