From 8ed6683baea27b13ddb4cd925ec87f4ad00a8d49 Mon Sep 17 00:00:00 2001 From: SoundCloudRapper365 Date: Tue, 6 Aug 2024 21:06:41 -0700 Subject: [PATCH 01/42] init --- .../action/SegmentAllItems.action | 5 + ada_feeding_msgs/msg/Mask.msg | 3 + .../segment_all_items.py | 129 ++++++++++++++++++ 3 files changed, 137 insertions(+) create mode 100644 ada_feeding_perception/ada_feeding_perception/segment_all_items.py diff --git a/ada_feeding_msgs/action/SegmentAllItems.action b/ada_feeding_msgs/action/SegmentAllItems.action index e1643dd0..3f0e7d0b 100644 --- a/ada_feeding_msgs/action/SegmentAllItems.action +++ b/ada_feeding_msgs/action/SegmentAllItems.action @@ -1,6 +1,8 @@ # The interface for an action that gets an image from the camera and returns # the masks of all segmented items within that image. +# The list of input semantic labels for the food items on the plate +string[] input_labels --- # Possible return statuses uint8 STATUS_SUCCEEDED=0 @@ -17,6 +19,9 @@ std_msgs/Header header sensor_msgs/CameraInfo camera_info # Masks of all the detected items in the image ada_feeding_msgs/Mask[] detected_items +# A list of semantic labels corresponding to each of the masks of detected +# items in the image +string[] item_labels --- # How much time the action has spent segmenting the food item builtin_interfaces/Duration elapsed_time diff --git a/ada_feeding_msgs/msg/Mask.msg b/ada_feeding_msgs/msg/Mask.msg index b1e7679d..1b43347e 100644 --- a/ada_feeding_msgs/msg/Mask.msg +++ b/ada_feeding_msgs/msg/Mask.msg @@ -19,6 +19,9 @@ float64 average_depth # An arbitrary ID that defines the segmented item string item_id +# An ID that semantically labels a specific, segmented item +string object_id + # A score that indicates how confident the segemntation algorithm is in # this mask. float64 confidence diff --git a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py new file mode 100644 index 00000000..45cad189 --- /dev/null +++ b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py @@ -0,0 +1,129 @@ +""" +This file defines the SegmentAllItems class, which launches an action server that +segments all food items in the latest image and defines each segmentation with a semantic +label using GPT-4V, GroundingDINO, and Segment Anything. 
+""" + +# Standard imports +import os +import threading +from typing import Optional, Tuple, Union + +# Third-party imports +import cv2 +from cv_bridge import CvBridge +from efficient_sam.efficient_sam import build_efficient_sam +import numpy as np +import numpy.typing as npt +from rcl_interfaces.msg import ParameterDescriptor, ParameterType +import rclpy +from rclpy.action import ActionServer, CancelResponse, GoalResponse +from rclpy.action.server import ServerGoalHandle +from rclpy.callback_groups import MutuallyExclusiveCallbackGroup +from rclpy.executors import MultiThreadedExecutor +from rclpy.node import Node +from rclpy.parameter import Parameter +from segment_anything import sam_model_registry, SamPredictor +from sensor_msgs.msg import CameraInfo, CompressedImage, Image, RegionOfInterest +import torch +from torchvision import transforms + +# Local imports +from ada_feeding_perception.action import SegmentAllItems +from ada_feeding_msgs.msg import Mask +from ada_feeding_perception.helpers import ( + bbox_from_mask, + crop_image_mask_and_point, + cv2_image_to_ros_msg, + download_checkpoint, + get_connected_component, + get_img_msg_type, + ros_msg_to_cv2_image, +) + +class SegmentAllItemsNode(Node): + """ + The SegmentAllItemsNode launches an action server that segments all food + items in the latest image and defines each segmentation with a semantic + label using GPT-4V, GroundingDINO, and Segment Anything. + """ + + def __init__(self): + """ + Initialize the SegmentAllItemsNode. + """ + + super().__init__("segment_all_items") + + # Check if cuda is available + self.device = "cuda" if torch.cuda.is_available() else "cpu" + + # Load the parameters' + + # Subscribe to the camera info topic, to get the camera intrinsics + self.camera_info = None + self.camera_info_lock = threading.Lock() + self.camera_info_subscriber = self.create_subscription( + CameraInfo, + "~/camera_info", + self.camera_info_callback, + 1, + callback_group=MutuallyExclusiveCallbackGroup(), + ) + + # Subscribe to the aligned depth image topic, to store the latest depth image + # NOTE: We assume this is in the same frame as the RGB image + self.latest_depth_img_msg = None + self.latest_depth_img_msg_lock = threading.Lock() + aligned_depth_topic = "~/aligned_depth" + try: + aligned_depth_type = get_img_msg_type(aligned_depth_topic, self) + except ValueError as err: + self.get_logger().error( + f"Error getting type of depth image topic. Defaulting to CompressedImage. {err}" + ) + aligned_depth_type = CompressedImage + self.depth_image_subscriber = self.create_subscription( + aligned_depth_type, + aligned_depth_topic, + self.depth_image_callback, + 1, + callback_group=MutuallyExclusiveCallbackGroup(), + ) + + # Subscribe to the RGB image topic, to store the latest image + self.latest_img_msg = None + self.latest_img_msg_lock = threading.Lock() + image_topic = "~/image" + try: + image_type = get_img_msg_type(image_topic, self) + except ValueError as err: + self.get_logger().error( + f"Error getting type of image topic. Defaulting to CompressedImage. {err}" + ) + image_type = CompressedImage + self.image_subscriber = self.create_subscription( + image_type, + image_topic, + self.image_callback, + 1, + callback_group=MutuallyExclusiveCallbackGroup(), + ) + + # Convert between ROS and CV images + self.bridge = CvBridge() + + # Create the Action Server. 
+ # Note: remapping action names does not work: https://github.com/ros2/ros2/issues/1312 + self._action_server = ActionServer( + self, + SegmentAllItems, + "SegmentAllItems", + self.execute_callback, + goal_callback=self.goal_callback, + cancel_callback=self.cancel_callback, + callback_group=MutuallyExclusiveCallbackGroup(), + ) + + + From dcb74b796418c4a354e45b34d5c14e158c128440 Mon Sep 17 00:00:00 2001 From: SoundCloudRapper365 Date: Sat, 10 Aug 2024 18:03:33 -0700 Subject: [PATCH 02/42] added neccessary callback functions --- .../segment_all_items.py | 221 ++++++++++++++++++ 1 file changed, 221 insertions(+) diff --git a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py index 45cad189..05305981 100644 --- a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py +++ b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py @@ -40,6 +40,11 @@ get_img_msg_type, ros_msg_to_cv2_image, ) +from ada_feeding_perception.segment_from_point import ( + initialize_sam, + initialize_efficient_sam, + generate_mask_msg, +) class SegmentAllItemsNode(Node): """ @@ -125,5 +130,221 @@ def __init__(self): callback_group=MutuallyExclusiveCallbackGroup(), ) + def read_params( + self, + ) -> Tuple[Parameter, Parameter, Parameter, Parameter, Parameter]: + """ + Read the parameters for this node. + + Returns + ------- + """ + ( + sam_model_name, + sam_model_base_url, + efficient_sam_model_name, + efficient_sam_model_base_url, + groundingdino_model_path, + model_dir, + use_efficient_sam, + n_contender_masks, + rate_hz, + min_depth_mm, + max_depth_mm, + ) = self.declare_parameters( + "", + [ + ( + "sam_model_name", + None, + ParameterDescriptor( + name="sam_model_name", + type=ParameterType.PARAMETER_STRING, + description="The name of the model checkpoint to use for SAM", + read_only=True, + ), + ), + ( + "sam_model_base_url", + None, + ParameterDescriptor( + name="sam_model_base_url", + type=ParameterType.PARAMETER_STRING, + description=( + "The URL to download the model checkpoint from if " + "it is not already downloaded for SAM" + ), + read_only=True, + ), + ), + ( + "efficient_sam_model_name", + None, + ParameterDescriptor( + name="efficient_sam_model_name", + type=ParameterType.PARAMETER_STRING, + description="The name of the model checkpoint to use for EfficientSAM", + read_only=True, + ), + ), + ( + "efficient_sam_model_base_url", + None, + ParameterDescriptor( + name="efficient_sam_model_base_url", + type=ParameterType.PARAMETER_STRING, + description=( + "The URL to download the model checkpoint from if " + "it is not already downloaded for EfficientSAM" + ), + read_only=True, + ), + ), + ( + "groundingdino_model_path", + None, + ParameterDescriptor( + name="groundingdino_model_path", + type=ParameterType.PARAMETER_STRING, + description="The name of the model checkpoint to use for Open-GroundingDINO", + read_only=True, + ), + ), + ( + "model_dir", + None, + ParameterDescriptor( + name="model_dir", + type=ParameterType.PARAMETER_STRING, + description=( + "The location of the directory where the model " + "checkpoint is / should be stored" + ), + read_only=True, + ), + ), + ( + "use_efficient_sam", + True, + ParameterDescriptor( + name="use_efficient_sam", + type=ParameterType.PARAMETER_BOOL, + description=("Whether to use EfficientSAM or SAM"), + read_only=True, + ), + ), + ( + "n_contender_masks", + 3, + ParameterDescriptor( + name="n_contender_masks", + type=ParameterType.PARAMETER_INTEGER, + 
description="The number of contender masks to return per point.", + read_only=True, + ), + ), + ( + "rate_hz", + 10.0, + ParameterDescriptor( + name="rate_hz", + type=ParameterType.PARAMETER_DOUBLE, + description="The rate at which to return feedback.", + read_only=True, + ), + ), + ( + "min_depth_mm", + 330, + ParameterDescriptor( + name="min_depth_mm", + type=ParameterType.PARAMETER_INTEGER, + description="The minimum depth in mm to consider in a mask.", + read_only=True, + ), + ), + ( + "max_depth_mm", + 10150000, + ParameterDescriptor( + name="max_depth_mm", + type=ParameterType.PARAMETER_INTEGER, + description="The maximum depth in mm to consider in a mask.", + read_only=True, + ), + ), + ], + ) + + if use_efficient_sam.value: + seg_model_name = efficient_sam_model_name.value + seg_model_base_url = efficient_sam_model_base_url.value + else: + seg_model_name = sam_model_name.value + seg_model_base_url = sam_model_base_url.value + + return ( + seg_model_name, + seg_model_base_url, + groundingdino_model_path.value, + model_dir.value, + use_efficient_sam.value, + n_contender_masks.value, + rate_hz.value, + min_depth_mm.value, + max_depth_mm.value, + ) + + def camera_info_callback(self, msg: CameraInfo) -> None: + """ + Store the latest camera info message. + + Parameters + ---------- + msg: The camera info message. + """ + with self.camera_info_lock: + self.camera_info = msg + + def depth_image_callback(self, msg: Union[Image, CompressedImage]) -> None: + """ + Store the latest depth image message. + + Parameters + ---------- + msg: The depth image message. + """ + with self.latest_depth_img_msg_lock: + self.latest_depth_img_msg = msg + + def image_callback(self, msg: Union[Image, CompressedImage]) -> None: + """ + Store the latest image message. + + Parameters + ---------- + msg: The image message. + """ + with self.latest_img_msg_lock: + self.latest_img_msg = msg + +def main(args=None): + """ + Launch the ROS node and spin. 
+ """ + rclpy.init(args=args) + + segment_all_items = SegmentAllItemsNode() + + # Use a MultiThreadedExecutor to enable processing goals concurrently + executor = MultiThreadedExecutor(num_threads=5) + + rclpy.spin(segment_all_items, executor=executor) + + +if __name__ == "__main__": + main() + + From c611e78416c88b30e29dfbce93c6c47f1db5077c Mon Sep 17 00:00:00 2001 From: SoundCloudRapper365 Date: Mon, 12 Aug 2024 22:24:45 -0700 Subject: [PATCH 03/42] implemented functionality to run sam and groundingdino --- .../segment_all_items.py | 349 +++++++++++++++++- 1 file changed, 336 insertions(+), 13 deletions(-) diff --git a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py index 05305981..9f5ef5dd 100644 --- a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py +++ b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py @@ -24,6 +24,9 @@ from rclpy.node import Node from rclpy.parameter import Parameter from segment_anything import sam_model_registry, SamPredictor +from groundingdino.models import build_model +from groundingdino.util.slconfig import SLConfig +from groundingdino.util.utils import get_phrases_from_posmap, clean_slate_dict from sensor_msgs.msg import CameraInfo, CompressedImage, Image, RegionOfInterest import torch from torchvision import transforms @@ -41,8 +44,6 @@ ros_msg_to_cv2_image, ) from ada_feeding_perception.segment_from_point import ( - initialize_sam, - initialize_efficient_sam, generate_mask_msg, ) @@ -64,6 +65,26 @@ def __init__(self): self.device = "cuda" if torch.cuda.is_available() else "cpu" # Load the parameters' + ( + seg_model_name, + seg_model_base_url, + groundingdino_config_path, + groundingdino_model_path, + model_dir, + self.use_efficient_sam, + self.rate_hz, + self.box_threshold, + self.text_threshold, + self.min_depth_mm, + self.max_depth_mm, + ) = self.read_params() + + # Download the checkpoint for SAM/EfficientSAM if it doesn't exist + seg_model_path = os.path.join(model_dir, seg_model_name) + if not os.path.isfile(seg_model_path): + self.get_logger().info("Model checkpoint does not exist. 
Downloading...") + download_checkpoint(seg_model_name, model_dir, seg_model_base_url) + self.get_logger().info(f"Model checkpoint downloaded {seg_model_path}.") # Subscribe to the camera info topic, to get the camera intrinsics self.camera_info = None @@ -115,6 +136,15 @@ def __init__(self): callback_group=MutuallyExclusiveCallbackGroup(), ) + # Initialize GroundingDINO + self.initialize_grounding_dino(groundingdino_config_path, groundingdino_model_path) + + # Initialize Segment Anything + if self.use_efficient_sam: + self.initialize_efficient_sam(seg_model_name, seg_model_path) + else: + self.initialize_sam(seg_model_name, seg_model_path) + # Convert between ROS and CV images self.bridge = CvBridge() @@ -144,11 +174,13 @@ def read_params( sam_model_base_url, efficient_sam_model_name, efficient_sam_model_base_url, + groundingdino_config_path, groundingdino_model_path, model_dir, use_efficient_sam, - n_contender_masks, rate_hz, + box_threshold, + text_threshold, min_depth_mm, max_depth_mm, ) = self.declare_parameters( @@ -200,6 +232,16 @@ def read_params( read_only=True, ), ), + ( + "groundingdino_config_path", + None, + ParameterDescriptor( + name="groundingdino_config_path", + type=ParameterType.PARAMETER_STRING, + description="The name of the configuration file to use for Open-GroundingDINO", + read_only=True, + ), + ) ( "groundingdino_model_path", None, @@ -234,22 +276,34 @@ def read_params( ), ), ( - "n_contender_masks", - 3, + "rate_hz", + 10.0, ParameterDescriptor( - name="n_contender_masks", - type=ParameterType.PARAMETER_INTEGER, - description="The number of contender masks to return per point.", + name="rate_hz", + type=ParameterType.PARAMETER_DOUBLE, + description="The rate at which to return feedback.", read_only=True, ), ), ( - "rate_hz", - 10.0, + "box_threshold", + 30.0, ParameterDescriptor( - name="rate_hz", + name="box_threshold", type=ParameterType.PARAMETER_DOUBLE, - description="The rate at which to return feedback.", + description="The lower threshold for the bounding box detections" + + "by Open-GroundingDINO.", + read_only=True, + ), + ), + ( + "text_threshold", + 25.0, + ParameterDescriptor( + name="text_threshold", + type=ParameterType.PARAMETER_DOUBLE, + description="The lower threshold for the text detections" + + "by Open-GroundingDINO.", read_only=True, ), ), @@ -286,15 +340,119 @@ def read_params( return ( seg_model_name, seg_model_base_url, + groundingdino_config_path.value, groundingdino_model_path.value, model_dir.value, use_efficient_sam.value, - n_contender_masks.value, rate_hz.value, + box_threshold.value, + text_threshold.value, min_depth_mm.value, max_depth_mm.value, ) + def initialize_grounding_dino( + self, groundingdino_config_path: str, groundingdino_model_path: str + ) -> None: + """ + Initialize the Open-GroundingDINO model. + + Parameters + ---------- + groundingdino_config_path: The path to the Open-GroundingDINO configuration file. + groundingdino_model_path: The path to the Open-GroundingDINO model checkpoint. 
+ """ + self.get_logger().info("Initializing Open-GroundingDINO...") + + # Get model configuration arguments from the configuration file + config_args = SLConfig.fromfile(groundingdino_config_path) + config_args.device = self.device + groundingdino = build_model(config_args) + + # Load the GroundingDINO model checkpoint + checkpoint = torch.load(groundingdino_model_path, map_location="cpu") + load_log = groundingdino.load_state_dict( + clean_slate_dict(checkpoint["model"]), strict=False + ) + self.get_logger().info(f"Loaded model checkpoint: {load_log}") + _ = groundingdino.eval() + self.groundingdino = groundingdino + + # Move bottom two functions to helpers file and import them for + # both segment_all_items.py and segment_from_point.py + def initialize_sam(self, model_name: str, model_path: str) -> None: + """ + Initialize all attributes needed for food segmentation with SAM. + + This includes loading the SAM, launching the action + server, and more. Note that we are guarenteed the model exists since + it was downloaded in the __init__ function of this class. + + Parameters + ---------- + model_name: The name of the model to load. + model_path: The path to the model checkpoint to load. + + Raises + ------ + ValueError if the model name does not contain vit_h, vit_l, or vit_b + """ + self.get_logger().info("Initializing SAM...") + # Load the model and move it to the specified device + if "vit_b" in model_name: # base model + model_type = "vit_b" + elif "vit_l" in model_name: # large model + model_type = "vit_l" + elif "vit_h" in model_name: # huge model + model_type = "vit_h" + else: + raise ValueError(f"Unknown model type {model_name}") + sam = sam_model_registry[model_type](checkpoint=model_path) + sam.to(device=self.device) + + # Create the predictor + # NOTE: If we allow for concurrent goals, this should be protected by + # a lock. + self.sam = SamPredictor(sam) + + self.get_logger().info("...Done!") + + def initialize_efficient_sam(self, model_name: str, model_path: str) -> None: + """ + Initialize all attributes needed for food segmentation with EfficientSAM. + + This includes loading the EfficientSAM model, launching the action + server, and more. Note that we are guarenteed the model exists since + it was downloaded in the __init__ function of this class. + + Parameters + ---------- + model_name: The name of the model to load. + model_path: The path to the model checkpoint to load. + + Raises + ------ + ValueError if the model name does not contain efficient_sam + """ + self.get_logger().info("Initializing EfficientSAM...") + # Hardcoded from https://github.com/yformer/EfficientSAM/blob/main/efficient_sam/build_efficient_sam.py + if "vits" in model_name: + encoder_patch_embed_dim = 384 + encoder_num_heads = 6 + elif "vitt" in model_name: + encoder_patch_embed_dim = 192 + encoder_num_heads = 3 + else: + raise ValueError(f"Unknown model type {model_name}") + self.efficient_sam = build_efficient_sam( + encoder_patch_embed_dim=encoder_patch_embed_dim, + encoder_num_heads=encoder_num_heads, + checkpoint=model_path, + ).eval() + self.efficient_sam.to(device=self.device) + + self.get_logger().info("...Done!") + def camera_info_callback(self, msg: CameraInfo) -> None: """ Store the latest camera info message. 
@@ -327,6 +485,171 @@ def image_callback(self, msg: Union[Image, CompressedImage]) -> None: """ with self.latest_img_msg_lock: self.latest_img_msg = msg + + def run_sam( + self, + image: npt.NDArray, + seed_point: Tuple[int, int], + bbox: Tuple[int, int, int, int], + prompt: int, + ): + """ + Run SAM on the image. + + Parameters + ---------- + image: The image to segment, in BGR. + seed_point: The seed point for SAM to segment from. + bbox: The bounding box prompt for SAM to segment. + prompt: The prompt to use for SAM. If 0, use the seed point prompt. + If 1, use the bounding box prompt. + + Returns + ------- + A list of tuples containing the confidence and mask for each segmentation. + """ + self.get_logger().info("Segmenting image with SAM...") + + # Convert image from BGR to RGB for Segment Anything + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + + # Run SAM on the image using the input prompt + self.sam.set_image(image) + if prompt == 0: + masks, scores, _ = self.sam.set_seed_point( + point_coords=seed_point, + point_labels=[1], + multimask_output=True, + ) + else: + masks, scores, _ = self.sam.set_bbox( + box=bbox, + multimask_output=True, + ) + + return masks, scores + + def run_efficient_sam( + self, + image: npt.NDArray, + seed_point: Tuple[int, int], + bbox: Tuple[int, int, int, int], + prompt: int, + ): + """ + Run EfficientSAM on the image. + + Parameters + ---------- + image: The image to segment, in BGR. + seed_point: The seed point for EfficientSAM to segment from. + bbox: The bounding box prompt for EfficientSAM to segment. + prompt: The prompt to use for EfficientSAM. If 0, use the seed point prompt. + If 1, use the bounding box prompt. + + Returns + ------- + masks: The masks for each segmentation. + scores: The confidence scores for each segmentation. + """ + self.get_logger().info("Segmenting image with EfficientSAM...") + + # Convert image from BGR to RGB for Segment Anything + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + + # Convert the image to a tensor + image_tensor = transforms.ToTensor()(image).to(self.device) + + # Convert input prompt (seed point or bounding box) to a tensor + if prompt == 0: + prompt_tensor = torch.tensor(np.array(seed_point).reshape((1, 1, 1, 2))).to( + device=self.device + ) + + # Define the labels for the input prompt + prompt_labels = torch.tensor([[[1]]]).to(device=self.device) + else: + prompt_tensor = torch.reshape(torch.tensor(bbox), [1, 1, 2, 2]).to(self.device) + + # Define the labels for the input prompt + prompt_labels = torch.reshape(torch.tensor([2, 3]), [1, 1, 2]).to(self.device) + + # Run EfficientSAM on the image using the input prompt + predicted_logits, predicted_iou = self.efficient_sam( + image_tensor[None, ...], + prompt_tensor, + prompt_labels, + ) + sorted_ids = torch.argsort(predicted_iou, dim=-1, descending=True) + predicted_iou = torch.take_along_dim(predicted_iou, sorted_ids, dim=2) + predicted_logits = torch.take_along_dim( + predicted_logits, sorted_ids[..., None, None], dim=2 + ) + masks = torch.ge(predicted_logits[0, 0, :, :, :], 0).cpu().detach().numpy() + scores = predicted_iou[0, 0, :].cpu().detach().numpy() + + return masks, scores + + def run_grounding_dino( + self, + image: npt.NDArray, + caption: str, + box_threshold: int, + text_threshold: int, + ): + """ + Run Open-GroundingDINO on the image. + + Parameters + ---------- + image: The image to retrieve semantically labeled bounding boxes from, in BGR. + caption: The caption to use for Open-GroundingDINO. 
+ box_threshold: The threshold for the bounding box. + text_threshold: The threshold for the text. + + Returns + ------- + bbox_predictions: A dictionary containing the bounding boxes for each food item label + detected from the image. + """ + self.get_logger().info("Running Open-GroundingDINO...") + + # Convert image from BGR to RGB for Open-GroundingDINO + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + + # Lowercase and strip the caption + caption = caption.lower().strip() + + # Run Open-GroundingDINO on the image using the input caption + with torch.no_grad(): + outputs = self.groundingdino( + image[None], + caption=[caption], + ) + logits = outputs["pred_logits"].sigmoid()[0] + boxes = outputs["pred_boxes"][0] + + # Filter the output based on the box and text thresholds + bbox_predictions = {} + logits_filt = logits.cpu().clone() + boxes_filt = boxes.cpu().clone() + filt_thresh_mask = logits_filt.max(dim=1)[0] > box_threshold + logits_filt = logits_filt[filt_thresh_mask] + boxes_filt = boxes_filt[filt_thresh_mask] + + # Tokenize the caption + tokenizer = self.groundingdino.tokenizer + caption_tokens = tokenizer(caption) + + # Build the dictionary of bounding boxes for each food item label detected + for logit, box in zip(logits_filt, boxes_filt): + # Predict phrases based on the bounding boxes and the text threshold + phrase = get_phrases_from_posmap(logit > text_threshold, caption_tokens, tokenizer) + if logit.max() > text_threshold: + bbox_predictions[phrase].append(box.cpu().numpy()) + + return bbox_predictions + def main(args=None): """ From 4fb02588583f0c1e3c714c7b5d441da382825e14 Mon Sep 17 00:00:00 2001 From: SoundCloudRapper365 Date: Tue, 13 Aug 2024 23:19:29 -0700 Subject: [PATCH 04/42] wrote vision pipeline and execute callback --- .../segment_all_items.py | 118 +++++++++++++++++- 1 file changed, 117 insertions(+), 1 deletion(-) diff --git a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py index 9f5ef5dd..bc2cad4a 100644 --- a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py +++ b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py @@ -535,7 +535,7 @@ def run_efficient_sam( seed_point: Tuple[int, int], bbox: Tuple[int, int, int, int], prompt: int, - ): + ) -> Tuple[npt.NDArray, npt.NDArray]: """ Run EfficientSAM on the image. @@ -649,6 +649,122 @@ def run_grounding_dino( bbox_predictions[phrase].append(box.cpu().numpy()) return bbox_predictions + + async def run_vision_pipeline(self, image: npt.NDArray, caption: str): + """ + Run the vision pipeline consisting of GroundingDINO and EfficientSAM on the image. + The caption and latest image are prompted to GroundingDINO which outputs bounding boxes + for each food item label in the caption detected in the image. The detected items are then + segmented by passing in the bounding box detections into EfficientSAM which outputs + pixel-wise masks for each bounding box. The top masks are then returned along with the + food item label for each mask as a dictionary. + + Parameters + ---------- + image: The image to segment, in BGR. + caption: The caption to use for GroundingDINO containing all the food items + detected in the image. + + Returns + ------- + mask_predictions: A dictionary containing the pixel-wise masks for each food item label + detected from running the GroundingDINO + EfficientSAM vision pipeline + on the image. 
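+            Keys are the detected label phrases; values are boolean pixel masks
+            (the top-scoring EfficientSAM mask for that item's bounding box).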
+ """ + # Run Open-GroundingDINO on the image + bbox_predictions = self.run_grounding_dino(image, caption, self.box_threshold, self.text_threshold) + + # Get the top contender mask for each food item label detected by + # GroundingDINO using EfficientSAM + mask_predictions = {} + for phrase, boxes in bbox_predictions.items(): + for box in boxes: + masks, scores = self.run_efficient_sam(image, None, box, 1) + if len(masks) > 0: + mask_predictions[phrase] = masks[0] + break + + return mask_predictions + + async def execute_callback( + self, goal_handle: ServerGoalHandle + ) -> SegmentAllItems.Result: + """ + Execute the action server callback. + + Parameters + ---------- + goal_handle: The goal handle for the action server. + + Returns + ------- + The result of the action server containing masks for all food items + detected in the image. + """ + starting_time = self.get_clock().now() + self.get_logger().info("Received a new goal!") + + # Get the latest image and camera info + with self.latest_img_msg_lock: + latest_img_msg = self.latest_img_msg + with self.camera_info_lock: + camera_info = self.camera_info + + # Check if the image and camera info are available + if latest_img_msg is None or camera_info is None: + self.get_logger().error("Image or camera info not available.") + return SegmentAllItems.Result() + + # Convert the input label list from goal request to a single string caption + # for GroundingDINO + caption = '. '.join(goal_handle.request.input_labels.lower().strip()) + caption += '.' + + # Start running the vision pipeline as a separate thread + rate = self.create_rate(self.rate_hz) + vision_pipeline_task = self.executor.create_task( + self.run_vision_pipeline, latest_img_msg, caption + ) + + # Wait for the vision pipeline to finish and keep publishing + # feedback (elapsed time) while waiting + feedback = SegmentAllItems.Feedback() + while ( + rclpy.ok() + and not goal_handle.is_cancel_requested() + and not vision_pipeline_task.done() + ): + feedback.elapsed_time = ((self.get_clock().now() - starting_time).nanoseconds / + 1e9).to_msg() + goal_handle.publish_feedback(feedback) + rate.sleep() + + # If there is a cancel request, cancel the vision pipeline task + if goal_handle.is_cancel_requested(): + self.get_logger().info("Goal cancelled.") + goal_handle.canceled() + result = SegmentAllItems.Result() + result.status = result.STATUS_CANCELLED + + # Clear the active goal + with self.active_goal_request_lock: + self.active_goal_request = None + + return result + + # Set the result after the task has been completed + self.get_logger().info("Goal not cancelled.") + self.get_logger().info("VIsion pipeline completed successfully.") + result = vision_pipeline_task.result() + goal_handle.succeed() + result.status = result.STATUS_SUCCEEDED + + # Clear the active goal + with self.active_goal_request_lock: + self.active_goal_request = None + + return result + def main(args=None): From 5d81216f34e07a7094a6a6ea3178a062eb44b853 Mon Sep 17 00:00:00 2001 From: SoundCloudRapper365 Date: Wed, 14 Aug 2024 11:49:51 -0700 Subject: [PATCH 05/42] created result message returned by vision pipeline --- .../segment_all_items.py | 23 ++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py index bc2cad4a..0c10f536 100644 --- a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py +++ 
b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py @@ -650,7 +650,7 @@ def run_grounding_dino( return bbox_predictions - async def run_vision_pipeline(self, image: npt.NDArray, caption: str): + async def run_vision_pipeline(self, image_msg: Image, caption: str): """ Run the vision pipeline consisting of GroundingDINO and EfficientSAM on the image. The caption and latest image are prompted to GroundingDINO which outputs bounding boxes @@ -671,6 +671,26 @@ async def run_vision_pipeline(self, image: npt.NDArray, caption: str): detected from running the GroundingDINO + EfficientSAM vision pipeline on the image. """ + self.get_logger().info("Running the vision pipeline...") + # Define the result and create result message header + result = SegmentAllItems.Result() + result.header = image_msg.header + with self.camera_info_lock: + if self.camera_info is not None: + result.camera_info = self.camera_info + else: + self.get_logger().warn( + "Camera info not received, not including in result message" + ) + + # Get the latest depth image and convert the depth image to OpenCV format + with self.latest_depth_img_msg_lock: + depth_img_msg = self.latest_depth_img_msg + depth_img = ros_msg_to_cv2_image(depth_img_msg, self.bridge) + + # Convert the image to OpenCV format + image = ros_msg_to_cv2_image(image_msg, self.bridge) + # Run Open-GroundingDINO on the image bbox_predictions = self.run_grounding_dino(image, caption, self.box_threshold, self.text_threshold) @@ -681,6 +701,7 @@ async def run_vision_pipeline(self, image: npt.NDArray, caption: str): for box in boxes: masks, scores = self.run_efficient_sam(image, None, box, 1) if len(masks) > 0: + mask_msg = self.generate_mask_msg() mask_predictions[phrase] = masks[0] break From bfcaeaae1a4fc4e3b505ff26574d75232cd66985 Mon Sep 17 00:00:00 2001 From: Sriram Kutty Date: Wed, 14 Aug 2024 13:05:25 -0700 Subject: [PATCH 06/42] modified launch file and created yaml file for parameters --- .../segment_all_items.py | 83 +++++++++++++++---- .../launch/ada_feeding_perception.launch.py | 17 ++++ 2 files changed, 84 insertions(+), 16 deletions(-) diff --git a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py index 0c10f536..b00a7fcd 100644 --- a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py +++ b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py @@ -287,7 +287,7 @@ def read_params( ), ( "box_threshold", - 30.0, + 0.30, ParameterDescriptor( name="box_threshold", type=ParameterType.PARAMETER_DOUBLE, @@ -298,7 +298,7 @@ def read_params( ), ( "text_threshold", - 25.0, + 0.25, ParameterDescriptor( name="text_threshold", type=ParameterType.PARAMETER_DOUBLE, @@ -485,6 +485,42 @@ def image_callback(self, msg: Union[Image, CompressedImage]) -> None: """ with self.latest_img_msg_lock: self.latest_img_msg = msg + + def goal_callback(self, goal_request: SegmentAllItems.Goal) -> GoalResponse: + """ + """ + # If no RGB image is received, reject the goal request + with self.latest_img_msg_lock: + if self.latest_img_msg is None: + self.get_logger().info( + "Rejecting goal request because no color image was received" + ) + return GoalResponse.REJECT + + # If no depth image is received, reject the goal request + with self.latest_depth_img_msg_lock: + if self.latest_depth_img_msg is None: + self.get_logger().info( + "Rejecting goal request because no depth image was received" + ) + return GoalResponse.REJECT + + # Accept the goal request is there 
isn't already an active one, + # otherwise reject it + with self.active_goal_request_lock: + if self.active_goal_request is None: + self.get_logger().info("Accepting goal request") + self.active_goal_request = goal_request + return GoalResponse.ACCEPT + self.get_logger().info( + "Rejecting goal request because there is already an active one" + ) + return GoalResponse.REJECT + + def cancel_callback(self) -> CancelResponse: + """ + """ + return CancelResponse.ACCEPT def run_sam( self, @@ -650,6 +686,21 @@ def run_grounding_dino( return bbox_predictions + def generate_mask_msg( + self, + item_id: str, + score: float, + mask:npt.NDArray[np.bool_], + image: npt.NDArray, + depth_img: npt.NDArray, + bbox: Tuple[int, int, int, int], + ) -> Optional[Mask]: + """ + """ + + mask_msg = Mask() + return mask_msg + async def run_vision_pipeline(self, image_msg: Image, caption: str): """ Run the vision pipeline consisting of GroundingDINO and EfficientSAM on the image. @@ -667,9 +718,7 @@ async def run_vision_pipeline(self, image_msg: Image, caption: str): Returns ------- - mask_predictions: A dictionary containing the pixel-wise masks for each food item label - detected from running the GroundingDINO + EfficientSAM vision pipeline - on the image. + """ self.get_logger().info("Running the vision pipeline...") # Define the result and create result message header @@ -694,18 +743,24 @@ async def run_vision_pipeline(self, image_msg: Image, caption: str): # Run Open-GroundingDINO on the image bbox_predictions = self.run_grounding_dino(image, caption, self.box_threshold, self.text_threshold) - # Get the top contender mask for each food item label detected by - # GroundingDINO using EfficientSAM - mask_predictions = {} + # Collect the top contender mask for each food item label detected by + # GroundingDINO using EfficientSAM and create dictionary of mask + # predictions from the pipeline + detected_items = [] + item_labels = [] for phrase, boxes in bbox_predictions.items(): for box in boxes: masks, scores = self.run_efficient_sam(image, None, box, 1) if len(masks) > 0: - mask_msg = self.generate_mask_msg() - mask_predictions[phrase] = masks[0] + mask_msg = self.generate_mask_msg( + phrase, scores[0], masks[0], image, depth_img, box + ) + detected_items.append(mask_msg) + item_labels.append(phrase) break - - return mask_predictions + result.detected_items = detected_items + result.item_labels = item_labels + return result async def execute_callback( self, goal_handle: ServerGoalHandle @@ -804,7 +859,3 @@ def main(args=None): if __name__ == "__main__": main() - - - - diff --git a/ada_feeding_perception/launch/ada_feeding_perception.launch.py b/ada_feeding_perception/launch/ada_feeding_perception.launch.py index d625a352..a2126924 100755 --- a/ada_feeding_perception/launch/ada_feeding_perception.launch.py +++ b/ada_feeding_perception/launch/ada_feeding_perception.launch.py @@ -91,6 +91,23 @@ def generate_launch_description(): ) launch_description.add_action(segment_from_point) + # Load the segment all items node + segment_all_items_config = os.path.join( + ada_feeding_perception_share_dir, "config", "segment_all_items.yaml" + ) + segment_all_items_params = {} + segment_all_items_params["model_dir"] = ParameterValue( + os.path.join(ada_feeding_perception_share_dir, "model"), value_type=str + ) + segment_all_items = Node( + package="ada_feeding_perception", + name="segment_all_items", + executable="segment_all_items", + parameters=[segment_all_items_config, segment_all_items_params], + 
remappings=realsense_remappings + aligned_depth_remapping, + ) + launch_description.add_action(segment_all_items) + # Load the face detection node face_detection_config = os.path.join( ada_feeding_perception_share_dir, "config", "face_detection.yaml" From 19e92759e0cd359e09f0385580febed2164bc3c8 Mon Sep 17 00:00:00 2001 From: Sriram Kutty Date: Fri, 16 Aug 2024 10:46:05 -0700 Subject: [PATCH 07/42] updated setup.py and modified parameters --- .../segment_all_items.py | 2 +- .../config/segment_all_items.yaml | 28 +++++++++++++++++++ ada_feeding_perception/setup.py | 1 + 3 files changed, 30 insertions(+), 1 deletion(-) create mode 100644 ada_feeding_perception/config/segment_all_items.yaml diff --git a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py index b00a7fcd..57d774f3 100644 --- a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py +++ b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py @@ -241,7 +241,7 @@ def read_params( description="The name of the configuration file to use for Open-GroundingDINO", read_only=True, ), - ) + ), ( "groundingdino_model_path", None, diff --git a/ada_feeding_perception/config/segment_all_items.yaml b/ada_feeding_perception/config/segment_all_items.yaml new file mode 100644 index 00000000..8bee688f --- /dev/null +++ b/ada_feeding_perception/config/segment_all_items.yaml @@ -0,0 +1,28 @@ +# NOTE: You have to change this node name if you change the node name in the launchfile. +segment_all_items: + ros_parameters: + # The name of the Segment Anything model to use + sam_model_name: sam_vit_b_01ec64.pth + # The URL to download the model checkpoint from if it is not already downloaded + sam_model_base_url: "https://dl.fbaipublicfiles.com/segment_anything/" + + # The name of the Efficient Segment Anything model checkpoint to use + efficient_sam_model_name: efficient_sam_vitt.pt + # The URL to download the model checkpoint from if it is not already downloaded + efficient_sam_model_base_url: "https://raw.githubusercontent.com/yformer/EfficientSAM/main/weights/" + + # The path to the configuration file for GroundingDINO + groundingdino_config_path: config/GroundingDINO_SwinT_OGC.py + # The name of the GroundingDINO model checkpoint to use + groundingdino_model_path: groundingdino_swint_ogc.pth + + # Whether to use SAM or EfficientSAM + use_efficient_sam: true + + # The rate (hz) at which to return feedback + rate_hz: 10.0 + + # The threshold for bounding box detections by GroundingDINO + box_threshold: 0.30 + # The threshold for text detections by GroundingDINO + text_threshold: 0.25 diff --git a/ada_feeding_perception/setup.py b/ada_feeding_perception/setup.py index 3443d38d..8d59bf16 100644 --- a/ada_feeding_perception/setup.py +++ b/ada_feeding_perception/setup.py @@ -49,6 +49,7 @@ "food_on_fork_detection = ada_feeding_perception.food_on_fork_detection:main", "republisher = ada_feeding_perception.republisher:main", "segment_from_point = ada_feeding_perception.segment_from_point:main", + "segment_all_items = ada_feeding_perception.segment_all_items:main", "test_segment_from_point = ada_feeding_perception.test_segment_from_point:main", "face_detection = ada_feeding_perception.face_detection:main", "table_detection = ada_feeding_perception.table_detection:main", From 4e7391f5be2d8090cc28cab9cfec2999c653f773 Mon Sep 17 00:00:00 2001 From: Sriram Kutty Date: Fri, 16 Aug 2024 14:15:04 -0700 Subject: [PATCH 08/42] added requirements to install and 
fixed imports --- .../ada_feeding_perception/segment_all_items.py | 9 +++------ ada_feeding_perception/config/segment_all_items.yaml | 2 +- requirements.txt | 1 + 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py index 57d774f3..60cd84ba 100644 --- a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py +++ b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py @@ -26,13 +26,13 @@ from segment_anything import sam_model_registry, SamPredictor from groundingdino.models import build_model from groundingdino.util.slconfig import SLConfig -from groundingdino.util.utils import get_phrases_from_posmap, clean_slate_dict +from groundingdino.util.utils import get_phrases_from_posmap, clean_state_dict from sensor_msgs.msg import CameraInfo, CompressedImage, Image, RegionOfInterest import torch from torchvision import transforms # Local imports -from ada_feeding_perception.action import SegmentAllItems +from ada_feeding_msgs.action import SegmentAllItems from ada_feeding_msgs.msg import Mask from ada_feeding_perception.helpers import ( bbox_from_mask, @@ -43,9 +43,6 @@ get_img_msg_type, ros_msg_to_cv2_image, ) -from ada_feeding_perception.segment_from_point import ( - generate_mask_msg, -) class SegmentAllItemsNode(Node): """ @@ -372,7 +369,7 @@ def initialize_grounding_dino( # Load the GroundingDINO model checkpoint checkpoint = torch.load(groundingdino_model_path, map_location="cpu") load_log = groundingdino.load_state_dict( - clean_slate_dict(checkpoint["model"]), strict=False + clean_state_dict(checkpoint["model"]), strict=False ) self.get_logger().info(f"Loaded model checkpoint: {load_log}") _ = groundingdino.eval() diff --git a/ada_feeding_perception/config/segment_all_items.yaml b/ada_feeding_perception/config/segment_all_items.yaml index 8bee688f..1d8bd18c 100644 --- a/ada_feeding_perception/config/segment_all_items.yaml +++ b/ada_feeding_perception/config/segment_all_items.yaml @@ -1,6 +1,6 @@ # NOTE: You have to change this node name if you change the node name in the launchfile. 
segment_all_items: - ros_parameters: + ros__parameters: # The name of the Segment Anything model to use sam_model_name: sam_vit_b_01ec64.pth # The URL to download the model checkpoint from if it is not already downloaded diff --git a/requirements.txt b/requirements.txt index ad48ff3a..f8f6fc95 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ sounddevice scikit-spatial git+https://github.com/facebookresearch/segment-anything.git git+https://github.com/yformer/EfficientSAM.git +git+https://github.com/IDEA-Research/GroundingDINO.git From 9c43fc4e1c5b1aa95e41f4e81a8bf72b3f0b6eeb Mon Sep 17 00:00:00 2001 From: SoundCloudRapper365 Date: Thu, 12 Sep 2024 10:37:46 -0700 Subject: [PATCH 09/42] changed grounding dino path and added checkpoint --- ada_feeding_perception/config/segment_all_items.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ada_feeding_perception/config/segment_all_items.yaml b/ada_feeding_perception/config/segment_all_items.yaml index 1d8bd18c..2a7aa29b 100644 --- a/ada_feeding_perception/config/segment_all_items.yaml +++ b/ada_feeding_perception/config/segment_all_items.yaml @@ -14,7 +14,7 @@ segment_all_items: # The path to the configuration file for GroundingDINO groundingdino_config_path: config/GroundingDINO_SwinT_OGC.py # The name of the GroundingDINO model checkpoint to use - groundingdino_model_path: groundingdino_swint_ogc.pth + groundingdino_model_path: groundingdino_checkpoint.pth # Whether to use SAM or EfficientSAM use_efficient_sam: true From 31551e4a30b83c2947cbedb9b228f4a52c61fcbd Mon Sep 17 00:00:00 2001 From: Sriram Kutty Date: Thu, 12 Sep 2024 16:53:02 -0700 Subject: [PATCH 10/42] Added config file + fixed image transformations --- .../segment_all_items.py | 54 ++++++++++++++----- .../config/segment_all_items.yaml | 4 +- .../model/GroundingDINO_SwinT_OGC.py | 43 +++++++++++++++ 3 files changed, 87 insertions(+), 14 deletions(-) create mode 100644 ada_feeding_perception/model/GroundingDINO_SwinT_OGC.py diff --git a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py index 60cd84ba..f150169a 100644 --- a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py +++ b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py @@ -26,10 +26,12 @@ from segment_anything import sam_model_registry, SamPredictor from groundingdino.models import build_model from groundingdino.util.slconfig import SLConfig +import groundingdino.datasets.transforms as T from groundingdino.util.utils import get_phrases_from_posmap, clean_state_dict from sensor_msgs.msg import CameraInfo, CompressedImage, Image, RegionOfInterest import torch from torchvision import transforms +from PIL import Image as ImagePIL # Local imports from ada_feeding_msgs.action import SegmentAllItems @@ -65,8 +67,8 @@ def __init__(self): ( seg_model_name, seg_model_base_url, - groundingdino_config_path, - groundingdino_model_path, + groundingdino_config_name, + groundingdino_model_name, model_dir, self.use_efficient_sam, self.rate_hz, @@ -83,6 +85,10 @@ def __init__(self): download_checkpoint(seg_model_name, model_dir, seg_model_base_url) self.get_logger().info(f"Model checkpoint downloaded {seg_model_path}.") + # Set the path to GroundingDINO and its config file in the model directory + groundingdino_model_path = os.path.join(model_dir, groundingdino_model_name) + groundingdino_config_path = os.path.join(model_dir, groundingdino_config_name) + # Subscribe to the camera info 
topic, to get the camera intrinsics self.camera_info = None self.camera_info_lock = threading.Lock() @@ -145,6 +151,11 @@ def __init__(self): # Convert between ROS and CV images self.bridge = CvBridge() + # Create the shared resource to ensure that the action server rejects all + # goals while a goal is currently active. + self.active_goal_request_lock = threading.Lock() + self.active_goal_request = None + # Create the Action Server. # Note: remapping action names does not work: https://github.com/ros2/ros2/issues/1312 self._action_server = ActionServer( @@ -171,8 +182,8 @@ def read_params( sam_model_base_url, efficient_sam_model_name, efficient_sam_model_base_url, - groundingdino_config_path, - groundingdino_model_path, + groundingdino_config_name, + groundingdino_model_name, model_dir, use_efficient_sam, rate_hz, @@ -230,20 +241,20 @@ def read_params( ), ), ( - "groundingdino_config_path", + "groundingdino_config_name", None, ParameterDescriptor( - name="groundingdino_config_path", + name="groundingdino_config_name", type=ParameterType.PARAMETER_STRING, description="The name of the configuration file to use for Open-GroundingDINO", read_only=True, ), ), ( - "groundingdino_model_path", + "groundingdino_model_name", None, ParameterDescriptor( - name="groundingdino_model_path", + name="groundingdino_model_name", type=ParameterType.PARAMETER_STRING, description="The name of the model checkpoint to use for Open-GroundingDINO", read_only=True, @@ -337,8 +348,8 @@ def read_params( return ( seg_model_name, seg_model_base_url, - groundingdino_config_path.value, - groundingdino_model_path.value, + groundingdino_config_name.value, + groundingdino_model_name.value, model_dir.value, use_efficient_sam.value, rate_hz.value, @@ -374,6 +385,9 @@ def initialize_grounding_dino( self.get_logger().info(f"Loaded model checkpoint: {load_log}") _ = groundingdino.eval() self.groundingdino = groundingdino + self.groundingdino.to(device=self.device) + + self.get_logger().info("...Done!") # Move bottom two functions to helpers file and import them for # both segment_all_items.py and segment_from_point.py @@ -654,10 +668,11 @@ def run_grounding_dino( caption = caption.lower().strip() # Run Open-GroundingDINO on the image using the input caption + image.to(device=self.device) with torch.no_grad(): outputs = self.groundingdino( image[None], - caption=[caption], + captions=[caption], ) logits = outputs["pred_logits"].sigmoid()[0] boxes = outputs["pred_boxes"][0] @@ -682,6 +697,20 @@ def run_grounding_dino( bbox_predictions[phrase].append(box.cpu().numpy()) return bbox_predictions + + def load_image(image_array: npt.NDArray): + # Convert image to image pillow to apply transformation + image_pil = ImagePIL.fromarray(image_array) + + transform = T.Compose( + [ + T.RandomResize([800], max_size=1333), + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), + ] + ) + image, _ = transform(image_pil, None) # 3, h, w + return image_pil, image def generate_mask_msg( self, @@ -790,8 +819,9 @@ async def execute_callback( # Convert the input label list from goal request to a single string caption # for GroundingDINO - caption = '. '.join(goal_handle.request.input_labels.lower().strip()) + caption = '. '.join(goal_handle.request.input_labels).lower().strip() caption += '.' 
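+        # For example (illustrative): input_labels ["Grapes", "Baby carrots"]
+        # becomes the caption "grapes. baby carrots." that is passed to GroundingDINO.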
+ self.get_logger().info(f"caption: {caption}") # Start running the vision pipeline as a separate thread rate = self.create_rate(self.rate_hz) diff --git a/ada_feeding_perception/config/segment_all_items.yaml b/ada_feeding_perception/config/segment_all_items.yaml index 2a7aa29b..63d22431 100644 --- a/ada_feeding_perception/config/segment_all_items.yaml +++ b/ada_feeding_perception/config/segment_all_items.yaml @@ -12,9 +12,9 @@ segment_all_items: efficient_sam_model_base_url: "https://raw.githubusercontent.com/yformer/EfficientSAM/main/weights/" # The path to the configuration file for GroundingDINO - groundingdino_config_path: config/GroundingDINO_SwinT_OGC.py + groundingdino_config_name: GroundingDINO_SwinT_OGC.py # The name of the GroundingDINO model checkpoint to use - groundingdino_model_path: groundingdino_checkpoint.pth + groundingdino_model_name: groundingdino_checkpoint.pth # Whether to use SAM or EfficientSAM use_efficient_sam: true diff --git a/ada_feeding_perception/model/GroundingDINO_SwinT_OGC.py b/ada_feeding_perception/model/GroundingDINO_SwinT_OGC.py new file mode 100644 index 00000000..9158d5f6 --- /dev/null +++ b/ada_feeding_perception/model/GroundingDINO_SwinT_OGC.py @@ -0,0 +1,43 @@ +batch_size = 1 +modelname = "groundingdino" +backbone = "swin_T_224_1k" +position_embedding = "sine" +pe_temperatureH = 20 +pe_temperatureW = 20 +return_interm_indices = [1, 2, 3] +backbone_freeze_keywords = None +enc_layers = 6 +dec_layers = 6 +pre_norm = False +dim_feedforward = 2048 +hidden_dim = 256 +dropout = 0.0 +nheads = 8 +num_queries = 900 +query_dim = 4 +num_patterns = 0 +num_feature_levels = 4 +enc_n_points = 4 +dec_n_points = 4 +two_stage_type = "standard" +two_stage_bbox_embed_share = False +two_stage_class_embed_share = False +transformer_activation = "relu" +dec_pred_bbox_embed_share = True +dn_box_noise_scale = 1.0 +dn_label_noise_ratio = 0.5 +dn_label_coef = 1.0 +dn_bbox_coef = 1.0 +embed_init_tgt = True +dn_labelbook_size = 2000 +max_text_len = 256 +text_encoder_type = "bert-base-uncased" +use_text_enhancer = True +use_fusion_layer = True +use_checkpoint = True +use_transformer_ckpt = True +use_text_cross_attention = True +text_dropout = 0.0 +fusion_dropout = 0.0 +fusion_droppath = 0.1 +sub_sentence_present = True From 3ec7b5048ae75cbdb3ffc802ce8ba1706d5814ef Mon Sep 17 00:00:00 2001 From: SoundCloudRapper365 Date: Fri, 13 Sep 2024 18:24:27 -0700 Subject: [PATCH 11/42] Added GroundingDINO visualization function --- .../segment_all_items.py | 59 +++++++++++++++++-- .../config/segment_all_items.yaml | 4 ++ 2 files changed, 58 insertions(+), 5 deletions(-) diff --git a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py index f150169a..9f92336b 100644 --- a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py +++ b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py @@ -76,6 +76,7 @@ def __init__(self): self.text_threshold, self.min_depth_mm, self.max_depth_mm, + self.viz_groundingdino, ) = self.read_params() # Download the checkpoint for SAM/EfficientSAM if it doesn't exist @@ -168,6 +169,13 @@ def __init__(self): callback_group=MutuallyExclusiveCallbackGroup(), ) + # If the GroundingDINO results visualization flage is set, then a publisher + # is created to visualize the bounding box predictions of GroundingDINO + if self.viz_groundingdino: + self.viz_groundingdino_pub = self.create_publisher( + Image, "~/plate_detection_img", 1 + ) + def read_params( self, ) -> 
Tuple[Parameter, Parameter, Parameter, Parameter, Parameter]: @@ -191,6 +199,7 @@ def read_params( text_threshold, min_depth_mm, max_depth_mm, + viz_groundingdino, ) = self.declare_parameters( "", [ @@ -335,6 +344,17 @@ def read_params( read_only=True, ), ), + ( + "viz_groundingdino", + True, + ParameterDescriptor( + name="viz_groundingdino", + type=ParameterType.PARAMETER_BOOL, + description="Whether to visualize the bounding box" + + "predictions of GroundingDINO.", + read_only=True, + ), + ) ], ) @@ -357,6 +377,7 @@ def read_params( text_threshold.value, min_depth_mm.value, max_depth_mm.value, + viz_groundingdino.value, ) def initialize_grounding_dino( @@ -661,8 +682,8 @@ def run_grounding_dino( """ self.get_logger().info("Running Open-GroundingDINO...") - # Convert image from BGR to RGB for Open-GroundingDINO - image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + # Convert image to Image pillow + image_pil = self.load_image(image) # Lowercase and strip the caption caption = caption.lower().strip() @@ -671,7 +692,7 @@ def run_grounding_dino( image.to(device=self.device) with torch.no_grad(): outputs = self.groundingdino( - image[None], + image_pil[None], captions=[caption], ) logits = outputs["pred_logits"].sigmoid()[0] @@ -723,10 +744,38 @@ def generate_mask_msg( ) -> Optional[Mask]: """ """ - + mask_msg = Mask() return mask_msg + def visualize_groundingdino_results(self, image: Image, predictions: dict): + """ + Visualizes the bounding box predictions of GroundingDINO and then + publishes the image as a ROS message. + + Parameters + ---------- + image: The image to visualize. + predictions: The bounding box predictions of GroundingDINO. + """ + + for phrase, boxes in predictions.items(): + for box in boxes: + # Visualize bounding box + x0, y0, x1, y1 = box + color = (0, 255, 0) + thickness = 6 + cv2.rectangle(image, (x0, y0), (x1, y1), color, thickness) + + # Display text label below bounding box + height, _, _ = image.shape() + cv2.putText(image, phrase, (x0, y0 - 12), 0, 1e-3 * height, color, thickness // 3) + + # Publish the image + self.viz_groundingdino_pub.publish( + cv2_image_to_ros_msg(image, compress=False, bridge=self.bridge) + ) + async def run_vision_pipeline(self, image_msg: Image, caption: str): """ Run the vision pipeline consisting of GroundingDINO and EfficientSAM on the image. 
@@ -763,7 +812,7 @@ async def run_vision_pipeline(self, image_msg: Image, caption: str): depth_img_msg = self.latest_depth_img_msg depth_img = ros_msg_to_cv2_image(depth_img_msg, self.bridge) - # Convert the image to OpenCV format + # Convert the image to OpenCV format image = ros_msg_to_cv2_image(image_msg, self.bridge) # Run Open-GroundingDINO on the image diff --git a/ada_feeding_perception/config/segment_all_items.yaml b/ada_feeding_perception/config/segment_all_items.yaml index 63d22431..cf903029 100644 --- a/ada_feeding_perception/config/segment_all_items.yaml +++ b/ada_feeding_perception/config/segment_all_items.yaml @@ -26,3 +26,7 @@ segment_all_items: box_threshold: 0.30 # The threshold for text detections by GroundingDINO text_threshold: 0.25 + + # A boolean to determine whether to visualize the bounding box predictions + # made by GroundingDINO + viz_groundingdino: true From 9dc9a40a3568f91c9d72a45917848b42f8530586 Mon Sep 17 00:00:00 2001 From: SoundCloudRapper365 Date: Mon, 16 Sep 2024 13:40:37 -0700 Subject: [PATCH 12/42] created GroundingDINO publisher for testing --- .../ada_feeding_perception/segment_all_items.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py index 9f92336b..aa1ffcf6 100644 --- a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py +++ b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py @@ -173,7 +173,7 @@ def __init__(self): # is created to visualize the bounding box predictions of GroundingDINO if self.viz_groundingdino: self.viz_groundingdino_pub = self.create_publisher( - Image, "~/plate_detection_img", 1 + Image, "~/groundingdino_detection", 1 ) def read_params( @@ -818,9 +818,15 @@ async def run_vision_pipeline(self, image_msg: Image, caption: str): # Run Open-GroundingDINO on the image bbox_predictions = self.run_grounding_dino(image, caption, self.box_threshold, self.text_threshold) + # Publish a visualization of the GroundingDINO predictions, if the visualization + # flag is set to true + if self.viz_groundingdino: + self.visualize_groundingdino_results(image, bbox_predictions) + # Collect the top contender mask for each food item label detected by # GroundingDINO using EfficientSAM and create dictionary of mask - # predictions from the pipeline + # predictions from the pipeline + """ detected_items = [] item_labels = [] for phrase, boxes in bbox_predictions.items(): @@ -834,7 +840,9 @@ async def run_vision_pipeline(self, image_msg: Image, caption: str): item_labels.append(phrase) break result.detected_items = detected_items - result.item_labels = item_labels + result.item_labels = item_labels + """ + return result async def execute_callback( From 929e570bc8497fee531f7b3b68018e1c9cd678a3 Mon Sep 17 00:00:00 2001 From: Sriram Kutty Date: Mon, 16 Sep 2024 16:57:34 -0700 Subject: [PATCH 13/42] added more testing code for bbox visualization --- .../action/SegmentAllItems.action | 2 +- .../segment_all_items.py | 24 ++++++++++++------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/ada_feeding_msgs/action/SegmentAllItems.action b/ada_feeding_msgs/action/SegmentAllItems.action index 3f0e7d0b..f91d8c1b 100644 --- a/ada_feeding_msgs/action/SegmentAllItems.action +++ b/ada_feeding_msgs/action/SegmentAllItems.action @@ -2,7 +2,7 @@ # the masks of all segmented items within that image. 
# The list of input semantic labels for the food items on the plate -string[] input_labels +string[] input_labels --- # Possible return statuses uint8 STATUS_SUCCEEDED=0 diff --git a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py index aa1ffcf6..a8c7c9c0 100644 --- a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py +++ b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py @@ -680,23 +680,24 @@ def run_grounding_dino( bbox_predictions: A dictionary containing the bounding boxes for each food item label detected from the image. """ - self.get_logger().info("Running Open-GroundingDINO...") + self.get_logger().info("Running GroundingDINO...") # Convert image to Image pillow - image_pil = self.load_image(image) + image_pil, image_transformed = self.load_image(image) # Lowercase and strip the caption caption = caption.lower().strip() # Run Open-GroundingDINO on the image using the input caption - image.to(device=self.device) + image_transformed.to(device=self.device) with torch.no_grad(): outputs = self.groundingdino( - image_pil[None], + image_transformed[None], captions=[caption], ) logits = outputs["pred_logits"].sigmoid()[0] boxes = outputs["pred_boxes"][0] + self.get_logger().info("... Done") # Filter the output based on the box and text thresholds bbox_predictions = {} @@ -715,11 +716,15 @@ def run_grounding_dino( # Predict phrases based on the bounding boxes and the text threshold phrase = get_phrases_from_posmap(logit > text_threshold, caption_tokens, tokenizer) if logit.max() > text_threshold: + if phrase not in bbox_predictions: + bbox_predictions[phrase] = [] bbox_predictions[phrase].append(box.cpu().numpy()) + self.get_logger().info(f"Predictions: {bbox_predictions}") + return bbox_predictions - def load_image(image_array: npt.NDArray): + def load_image(self, image_array: npt.NDArray): # Convert image to image pillow to apply transformation image_pil = ImagePIL.fromarray(image_array) @@ -765,12 +770,15 @@ def visualize_groundingdino_results(self, image: Image, predictions: dict): x0, y0, x1, y1 = box color = (0, 255, 0) thickness = 6 - cv2.rectangle(image, (x0, y0), (x1, y1), color, thickness) + image = cv2.rectangle(image, (x0, y0), (x1, y1), color, thickness) # Display text label below bounding box height, _, _ = image.shape() - cv2.putText(image, phrase, (x0, y0 - 12), 0, 1e-3 * height, color, thickness // 3) + image = cv2.putText(image, phrase, (x0, y0 - 12), 0, 1e-3 * height, color, thickness // 3) + cv2.imshow('image', image) + cv2.waitKey(0) + cv2.destroyAllWindows() # Publish the image self.viz_groundingdino_pub.publish( cv2_image_to_ros_msg(image, compress=False, bridge=self.bridge) @@ -787,7 +795,7 @@ async def run_vision_pipeline(self, image_msg: Image, caption: str): Parameters ---------- - image: The image to segment, in BGR. + image: The image to segment, as a ROS image message. caption: The caption to use for GroundingDINO containing all the food items detected in the image. 
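The next commit reworks this visualization by scaling GroundingDINO's normalized (cx, cy, w, h) outputs into pixel-space (x0, y0, x1, y1) corners before drawing. A minimal standalone sketch of that conversion, with a worked example; the function name is illustrative and not part of the patches.

import numpy as np

def cxcywh_to_xyxy(box, width: int, height: int):
    # Scale the normalized box to pixel units, then move from center/size
    # to top-left / bottom-right corners.
    cx, cy, w, h = np.asarray(box, dtype=float) * [width, height, width, height]
    x0, y0 = cx - w / 2.0, cy - h / 2.0
    return [x0, y0, x0 + w, y0 + h]

# cxcywh_to_xyxy([0.5, 0.5, 0.2, 0.4], width=640, height=480)
# -> [256.0, 144.0, 384.0, 336.0]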
From e1ebf8b967bfc650818a0c288a16dff2294ada24 Mon Sep 17 00:00:00 2001 From: Sriram Kutty Date: Wed, 18 Sep 2024 15:56:56 -0700 Subject: [PATCH 14/42] fixed groundingdino results visualization --- .../segment_all_items.py | 52 +++++++++++++------ .../config/segment_all_items.yaml | 2 +- 2 files changed, 37 insertions(+), 17 deletions(-) diff --git a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py index a8c7c9c0..2b0e61cc 100644 --- a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py +++ b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py @@ -11,6 +11,7 @@ # Third-party imports import cv2 +import time from cv_bridge import CvBridge from efficient_sam.efficient_sam import build_efficient_sam import numpy as np @@ -399,14 +400,13 @@ def initialize_grounding_dino( groundingdino = build_model(config_args) # Load the GroundingDINO model checkpoint - checkpoint = torch.load(groundingdino_model_path, map_location="cpu") + checkpoint = torch.load(groundingdino_model_path, map_location=self.device) load_log = groundingdino.load_state_dict( clean_state_dict(checkpoint["model"]), strict=False ) self.get_logger().info(f"Loaded model checkpoint: {load_log}") _ = groundingdino.eval() - self.groundingdino = groundingdino - self.groundingdino.to(device=self.device) + self.groundingdino = groundingdino.to(device=self.device) self.get_logger().info("...Done!") @@ -662,8 +662,8 @@ def run_grounding_dino( self, image: npt.NDArray, caption: str, - box_threshold: int, - text_threshold: int, + box_threshold: float, + text_threshold: float, ): """ Run Open-GroundingDINO on the image. @@ -681,6 +681,7 @@ def run_grounding_dino( detected from the image. """ self.get_logger().info("Running GroundingDINO...") + inference_time = time.time() # Convert image to Image pillow image_pil, image_transformed = self.load_image(image) @@ -689,14 +690,15 @@ def run_grounding_dino( caption = caption.lower().strip() # Run Open-GroundingDINO on the image using the input caption - image_transformed.to(device=self.device) + image_transformed = image_transformed.to(device=self.device) + self.get_logger().info(f"device: {self.device}") with torch.no_grad(): outputs = self.groundingdino( image_transformed[None], captions=[caption], ) - logits = outputs["pred_logits"].sigmoid()[0] - boxes = outputs["pred_boxes"][0] + logits = outputs["pred_logits"].sigmoid()[0] + boxes = outputs["pred_boxes"][0] self.get_logger().info("... 
Done") # Filter the output based on the box and text thresholds @@ -707,6 +709,10 @@ def run_grounding_dino( logits_filt = logits_filt[filt_thresh_mask] boxes_filt = boxes_filt[filt_thresh_mask] + self.get_logger().info(f"Caption: {caption}") + self.get_logger().info(f"Boxes: {boxes_filt}") + self.get_logger().info(f"Logits: {logits_filt}") + # Tokenize the caption tokenizer = self.groundingdino.tokenizer caption_tokens = tokenizer(caption) @@ -715,13 +721,17 @@ def run_grounding_dino( for logit, box in zip(logits_filt, boxes_filt): # Predict phrases based on the bounding boxes and the text threshold phrase = get_phrases_from_posmap(logit > text_threshold, caption_tokens, tokenizer) - if logit.max() > text_threshold: - if phrase not in bbox_predictions: - bbox_predictions[phrase] = [] - bbox_predictions[phrase].append(box.cpu().numpy()) + self.get_logger().info(f"logit: {logit}, box: {box}") + self.get_logger().info(f"{phrase}") + if phrase not in bbox_predictions: + bbox_predictions[phrase] = [] + bbox_predictions[phrase].append(box.cpu().numpy()) self.get_logger().info(f"Predictions: {bbox_predictions}") + inference_time = int(round((time.time() - inference_time) * 1000)) + self.get_logger().info(f"Approx. Inference Time: {inference_time}") + return bbox_predictions def load_image(self, image_array: npt.NDArray): @@ -763,17 +773,27 @@ def visualize_groundingdino_results(self, image: Image, predictions: dict): image: The image to visualize. predictions: The bounding box predictions of GroundingDINO. """ + # Define height and width of image + height, width, _ = image.shape + self.get_logger().info(f"height, width: {height}, {width}") for phrase, boxes in predictions.items(): for box in boxes: - # Visualize bounding box - x0, y0, x1, y1 = box + # Scale the box from percentage values to pixel values + box = np.multiply(box, np.array([width, height, width, height])) + center_x, center_y, w, h = box + # Get the bottom left and top right coordinates of the box + x0 = center_x - (w / 2) + y0 = center_y - (h / 2) + x1 = x0 + w + y1 = y0 + h + x0, y0, x1, y1 = int(x0), int(y0), int(x1), int(y1) + self.get_logger().info(f"box: {x0}, {y0}, {x1}, {y1}") color = (0, 255, 0) thickness = 6 image = cv2.rectangle(image, (x0, y0), (x1, y1), color, thickness) # Display text label below bounding box - height, _, _ = image.shape() image = cv2.putText(image, phrase, (x0, y0 - 12), 0, 1e-3 * height, color, thickness // 3) cv2.imshow('image', image) @@ -884,7 +904,7 @@ async def execute_callback( # Convert the input label list from goal request to a single string caption # for GroundingDINO - caption = '. '.join(goal_handle.request.input_labels).lower().strip() + caption = ' . '.join(goal_handle.request.input_labels).lower().strip() caption += '.' 
self.get_logger().info(f"caption: {caption}") diff --git a/ada_feeding_perception/config/segment_all_items.yaml b/ada_feeding_perception/config/segment_all_items.yaml index cf903029..0dc77823 100644 --- a/ada_feeding_perception/config/segment_all_items.yaml +++ b/ada_feeding_perception/config/segment_all_items.yaml @@ -14,7 +14,7 @@ segment_all_items: # The path to the configuration file for GroundingDINO groundingdino_config_name: GroundingDINO_SwinT_OGC.py # The name of the GroundingDINO model checkpoint to use - groundingdino_model_name: groundingdino_checkpoint.pth + groundingdino_model_name: groundingdino_swint_ogc.pth # Whether to use SAM or EfficientSAM use_efficient_sam: true From 704caa151c3b83d27f1532bbce53345961dd76e5 Mon Sep 17 00:00:00 2001 From: SoundCloudRapper365 Date: Thu, 19 Sep 2024 11:02:46 -0700 Subject: [PATCH 15/42] corrected image preprocessing? --- .../ada_feeding_perception/segment_all_items.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py index 2b0e61cc..14532900 100644 --- a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py +++ b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py @@ -670,7 +670,7 @@ def run_grounding_dino( Parameters ---------- - image: The image to retrieve semantically labeled bounding boxes from, in BGR. + image: The CV2 image to retrieve semantically labeled bounding boxes from. caption: The caption to use for Open-GroundingDINO. box_threshold: The threshold for the bounding box. text_threshold: The threshold for the text. @@ -736,7 +736,7 @@ def run_grounding_dino( def load_image(self, image_array: npt.NDArray): # Convert image to image pillow to apply transformation - image_pil = ImagePIL.fromarray(image_array) + image_pil = ImagePIL.fromarray(image_array, mode="RGB") transform = T.Compose( [ From 4f9305d36f8eac99b5863ee7c1dbf7f021833565 Mon Sep 17 00:00:00 2001 From: Sriram Kutty Date: Mon, 23 Sep 2024 12:57:47 -0700 Subject: [PATCH 16/42] groundingdino works! --- .../segment_all_items.py | 71 ++++++++++++------- 1 file changed, 47 insertions(+), 24 deletions(-) diff --git a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py index 14532900..7ba1dc00 100644 --- a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py +++ b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py @@ -549,9 +549,10 @@ def goal_callback(self, goal_request: SegmentAllItems.Goal) -> GoalResponse: ) return GoalResponse.REJECT - def cancel_callback(self) -> CancelResponse: + def cancel_callback(self_: ServerGoalHandle) -> CancelResponse: """ """ + self.get_logger().info("Cancelling the goal request...") return CancelResponse.ACCEPT def run_sam( @@ -681,6 +682,9 @@ def run_grounding_dino( detected from the image. """ self.get_logger().info("Running GroundingDINO...") + + # Set the initial time to measure the elapsed time running GroundingDINO on the + # desired image and text prompts. inference_time = time.time() # Convert image to Image pillow @@ -702,7 +706,7 @@ def run_grounding_dino( self.get_logger().info("... 
Done") # Filter the output based on the box and text thresholds - bbox_predictions = {} + boxes_cxcywh = {} logits_filt = logits.cpu().clone() boxes_filt = boxes.cpu().clone() filt_thresh_mask = logits_filt.max(dim=1)[0] > box_threshold @@ -723,21 +727,47 @@ def run_grounding_dino( phrase = get_phrases_from_posmap(logit > text_threshold, caption_tokens, tokenizer) self.get_logger().info(f"logit: {logit}, box: {box}") self.get_logger().info(f"{phrase}") - if phrase not in bbox_predictions: - bbox_predictions[phrase] = [] - bbox_predictions[phrase].append(box.cpu().numpy()) + if phrase not in boxes_cxcywh: + boxes_cxcywh[phrase] = [] + boxes_cxcywh[phrase].append(box.cpu().numpy()) + - self.get_logger().info(f"Predictions: {bbox_predictions}") + # Define height and width of image + height, width, _ = image.shape + self.get_logger().info(f"height, width: {height}, {width}") + # Convert the bounding boxes outputted by GroundingDINO to the following format + # [top left x-value, top left y-value, bottom right x-value, bottom right y-value] + # and unnormalize the bounding box coordinate values + boxes_xyxy = {} + for phrase, boxes in boxes_cxcywh.items(): + boxes_xyxy[phrase] = [] + for box in boxes: + # Scale the box from percentage values to pixel values + box = np.multiply(box, np.array([width, height, width, height])) + center_x, center_y, w, h = box + # Get the bottom left and top right coordinates of the box + x0 = center_x - (w / 2) + y0 = center_y - (h / 2) + x1 = x0 + w + y1 = y0 + h + boxes_xyxy[phrase].append([x0, y0, x1, y1]) + + self.get_logger().info(f"Predictions: {boxes_xyxy}") + + # Measure the elapsed time running GroundingDINO on the image prompt inference_time = int(round((time.time() - inference_time) * 1000)) self.get_logger().info(f"Approx. Inference Time: {inference_time}") - return bbox_predictions + return boxes_xyxy def load_image(self, image_array: npt.NDArray): + # Convert image from BGR to RGB to convert CV2 image to image pillow + image_array = cv2.cvtColor(image_array, cv2.COLOR_BGR2RGB) + # Convert image to image pillow to apply transformation image_pil = ImagePIL.fromarray(image_array, mode="RGB") - + image_pil.show() transform = T.Compose( [ T.RandomResize([800], max_size=1333), @@ -746,6 +776,7 @@ def load_image(self, image_array: npt.NDArray): ] ) image, _ = transform(image_pil, None) # 3, h, w + return image_pil, image def generate_mask_msg( @@ -773,22 +804,13 @@ def visualize_groundingdino_results(self, image: Image, predictions: dict): image: The image to visualize. predictions: The bounding box predictions of GroundingDINO. 
""" - # Define height and width of image - height, width, _ = image.shape - self.get_logger().info(f"height, width: {height}, {width}") + # Define height of image + height, _, _ = image.shape for phrase, boxes in predictions.items(): for box in boxes: - # Scale the box from percentage values to pixel values - box = np.multiply(box, np.array([width, height, width, height])) - center_x, center_y, w, h = box - # Get the bottom left and top right coordinates of the box - x0 = center_x - (w / 2) - y0 = center_y - (h / 2) - x1 = x0 + w - y1 = y0 + h - x0, y0, x1, y1 = int(x0), int(y0), int(x1), int(y1) - self.get_logger().info(f"box: {x0}, {y0}, {x1}, {y1}") + x0, y0, x1, y1 = int(box[0]), int(box[1]), int(box[2]), int(box[3]) + #self.get_logger().info(f"box: {x0}, {y0}, {x1}, {y1}") color = (0, 255, 0) thickness = 6 image = cv2.rectangle(image, (x0, y0), (x1, y1), color, thickness) @@ -854,12 +876,12 @@ async def run_vision_pipeline(self, image_msg: Image, caption: str): # Collect the top contender mask for each food item label detected by # GroundingDINO using EfficientSAM and create dictionary of mask # predictions from the pipeline - """ detected_items = [] item_labels = [] for phrase, boxes in bbox_predictions.items(): for box in boxes: masks, scores = self.run_efficient_sam(image, None, box, 1) + self.get_logger().info(f"Mask: {masks[0]}") if len(masks) > 0: mask_msg = self.generate_mask_msg( phrase, scores[0], masks[0], image, depth_img, box @@ -867,10 +889,11 @@ async def run_vision_pipeline(self, image_msg: Image, caption: str): detected_items.append(mask_msg) item_labels.append(phrase) break + #self.get_logger().info(f"Detected items: {detected_items}") + self.get_logger().info(f"Item_labels: {item_labels}") result.detected_items = detected_items result.item_labels = item_labels - """ - + return result async def execute_callback( From 024c71cd88ae2c461dbcaed9f98729dbd09a94b9 Mon Sep 17 00:00:00 2001 From: Sriram Kutty Date: Mon, 23 Sep 2024 14:40:32 -0700 Subject: [PATCH 17/42] masks are now displayable --- .../segment_all_items.py | 60 +++++++++++++++---- 1 file changed, 50 insertions(+), 10 deletions(-) diff --git a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py index 7ba1dc00..336ad79a 100644 --- a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py +++ b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py @@ -12,6 +12,7 @@ # Third-party imports import cv2 import time +import random from cv_bridge import CvBridge from efficient_sam.efficient_sam import build_efficient_sam import numpy as np @@ -33,6 +34,7 @@ import torch from torchvision import transforms from PIL import Image as ImagePIL +from copy import deepcopy # Local imports from ada_feeding_msgs.action import SegmentAllItems @@ -767,7 +769,7 @@ def load_image(self, image_array: npt.NDArray): # Convert image to image pillow to apply transformation image_pil = ImagePIL.fromarray(image_array, mode="RGB") - image_pil.show() + #image_pil.show() transform = T.Compose( [ T.RandomResize([800], max_size=1333), @@ -804,8 +806,11 @@ def visualize_groundingdino_results(self, image: Image, predictions: dict): image: The image to visualize. predictions: The bounding box predictions of GroundingDINO. 
""" + # Create a deep copy of the image to visualize + image_copy = deepcopy(image) + # Define height of image - height, _, _ = image.shape + height, _, _ = image_copy.shape for phrase, boxes in predictions.items(): for box in boxes: @@ -813,18 +818,50 @@ def visualize_groundingdino_results(self, image: Image, predictions: dict): #self.get_logger().info(f"box: {x0}, {y0}, {x1}, {y1}") color = (0, 255, 0) thickness = 6 - image = cv2.rectangle(image, (x0, y0), (x1, y1), color, thickness) + image_copy = cv2.rectangle(image_copy, (x0, y0), (x1, y1), color, thickness) # Display text label below bounding box - image = cv2.putText(image, phrase, (x0, y0 - 12), 0, 1e-3 * height, color, thickness // 3) + image_copy = cv2.putText(image_copy, phrase, (x0, y0 - 12), 0, 1e-3 * height, color, thickness // 3) - cv2.imshow('image', image) - cv2.waitKey(0) - cv2.destroyAllWindows() + #cv2.imshow('image', image_copy) + #cv2.waitKey(0) + #cv2.destroyAllWindows() # Publish the image self.viz_groundingdino_pub.publish( - cv2_image_to_ros_msg(image, compress=False, bridge=self.bridge) + cv2_image_to_ros_msg(image_copy, compress=False, bridge=self.bridge) ) + + def display_masks(self, image: npt.NDArray, masks: list[npt.NDArray], item_labels: list[str]): + """ + Display the masks on the image. + + Parameters + ---------- + image: The image to display the masks on. + masks: The masks to display on the image. + """ + self.get_logger().info("Displaying masks...") + + # Create a deep copy of the image to visualize + image_copy = deepcopy(image) + + # Display the masks on the image + for mask, label in zip(masks, item_labels): + mask = mask.astype(np.uint8) + mask = cv2.resize(mask, (image.shape[1], image.shape[0])) + color_dims = 3 + mask = np.stack([mask] * color_dims, axis=-1) + color_scalar = random.randint(80, 255) + mask = np.multiply(mask, color_scalar) + cv2.imshow(label, mask) + cv2.waitKey(0) + cv2.destroyAllWindows() + #self.get_logger().info(f"Mask max: {np.max(mask)}") + #image_copy = cv2.addWeighted(image, 1.0, mask, 0.3, 0) + + #cv2.imshow("Masks", image) + #cv2.waitKey(0) + #cv2.destroyAllWindows() async def run_vision_pipeline(self, image_msg: Image, caption: str): """ @@ -878,17 +915,20 @@ async def run_vision_pipeline(self, image_msg: Image, caption: str): # predictions from the pipeline detected_items = [] item_labels = [] + masks_list = [] for phrase, boxes in bbox_predictions.items(): for box in boxes: masks, scores = self.run_efficient_sam(image, None, box, 1) - self.get_logger().info(f"Mask: {masks[0]}") if len(masks) > 0: + self.get_logger().info(f"Mask: {masks[0]}") + masks_list.append(masks[0]) mask_msg = self.generate_mask_msg( phrase, scores[0], masks[0], image, depth_img, box ) detected_items.append(mask_msg) item_labels.append(phrase) - break + + self.display_masks(image, masks_list, item_labels) #self.get_logger().info(f"Detected items: {detected_items}") self.get_logger().info(f"Item_labels: {item_labels}") result.detected_items = detected_items From c78cd4a0d3da3d5e99ff58e59761be4efaab2f76 Mon Sep 17 00:00:00 2001 From: Sriram Kutty Date: Tue, 24 Sep 2024 13:30:57 -0700 Subject: [PATCH 18/42] record vision pipeline inference time --- .../ada_feeding_perception/segment_all_items.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py index 336ad79a..2699e5de 100644 --- a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py +++ 
b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py @@ -883,6 +883,11 @@ async def run_vision_pipeline(self, image_msg: Image, caption: str): """ self.get_logger().info("Running the vision pipeline...") + + # Set the initial time to measure the elapsed time running GroundingDINO on the + # desired image and text prompts. + inference_time = time.time() + # Define the result and create result message header result = SegmentAllItems.Result() result.header = image_msg.header @@ -934,6 +939,10 @@ async def run_vision_pipeline(self, image_msg: Image, caption: str): result.detected_items = detected_items result.item_labels = item_labels + # Measure the elapsed time running GroundingDINO on the image prompt + inference_time = int(round((time.time() - inference_time) * 1000)) + self.get_logger().info(f"VISION PIPELINE - Approx. Inference Time: {inference_time}") + return result async def execute_callback( From e503800e55e7b0fdb07e0c329c5ddcbf4b01bf9b Mon Sep 17 00:00:00 2001 From: SoundCloudRapper365 Date: Fri, 27 Sep 2024 14:35:30 -0700 Subject: [PATCH 19/42] wrote code to generate mask messages during action calls --- .../segment_all_items.py | 82 ++++++++++++++++++- 1 file changed, 81 insertions(+), 1 deletion(-) diff --git a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py index 2699e5de..c791b6cb 100644 --- a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py +++ b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py @@ -557,6 +557,30 @@ def cancel_callback(self_: ServerGoalHandle) -> CancelResponse: self.get_logger().info("Cancelling the goal request...") return CancelResponse.ACCEPT + def run_gpt4v(self, image: npt.NDArray, system_prompt: str, user_prompt: str): + """ + Run GPT-4V on the image. + + Parameters + ---------- + image: The image to segment, in BGR. + system_prompt: The system prompt to use for GPT-4V. + user_prompt: The user prompt to use for GPT-4V. + + Returns + ------- + The caption generated by GPT-4V. + """ + self.get_logger().info("Running GPT-4V...") + + # Convert image from BGR to RGB for GPT-4V + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + + # Run GPT-4V on the image using the input prompts + caption = "" + + return caption + def run_sam( self, image: npt.NDArray, @@ -784,6 +808,7 @@ def load_image(self, image_array: npt.NDArray): def generate_mask_msg( self, item_id: str, + object_id: str, score: float, mask:npt.NDArray[np.bool_], image: npt.NDArray, @@ -792,8 +817,60 @@ def generate_mask_msg( ) -> Optional[Mask]: """ """ + # Calculate center of the bounding box and use it as the seed point for + # getting the connected component of the mask + center_x = (bbox[0] + bbox[2]) // 2 + center_y = (bbox[1] + bbox[3]) // 2 + seed_point = (center_x, center_y) + + # Use the mask to get a connected component containing the seed point + cleaned_mask = get_connected_component(mask, seed_point) + # Use the cleaned mask to calculate the median depth over the mask + masked_depth = depth_img[cleaned_mask] + median_depth_mm = np.median( + masked_depth[ + np.logical_and( + masked_depth >= self.min_depth_mm, masked_depth <= self.max_depth_mm + ) + ] + ) + # If the depth is invalid, skip this mask and return None + if np.isnan(median_depth_mm): + self.get_logger().warn( + f"No depth points within [{self.min_depth_mm}, {self.max_depth_mm}] mm range " + f"for mask {item_id}. Skipping mask." 
+ ) + return None + + # Crop the image and the mask + cropped_image, cropped_mask = crop_image_mask_and_point( + image, cleaned_mask, seed_point, bbox + ) + cropped_depth, _, _ = crop_image_mask_and_point( + depth_img, cleaned_mask, seed_point, bbox + ) + + # Convert the mask to an image + mask_img = np.where(cropped_mask, 255, 0).astype(np.uint8) + + # Create the ROS mask message mask_msg = Mask() + mask_msg.roi = RegionOfInterest( + x_offset=bbox[0], + y_offset=bbox[1], + height=int(bbox[3] - bbox[1]), + width=int(bbox[2] - bbox[0]), + do_rectify=False, + ) + mask_msg.mask = cv2_image_to_ros_msg(mask_img, compress=True) + mask_msg.rgb_image = cv2_image_to_ros_msg(cropped_image, compress=True) + mask_msg.depth_image = cv2_image_to_ros_msg(cropped_depth, compress=False) + mask_msg.average_depth = median_depth_mm / 1000.0 + mask_msg.item_id = item_id + mask_msg.object_id = object_id + mask_msg.score = score + return mask_msg def visualize_groundingdino_results(self, image: Image, predictions: dict): @@ -921,14 +998,17 @@ async def run_vision_pipeline(self, image_msg: Image, caption: str): detected_items = [] item_labels = [] masks_list = [] + mask_num = 1 for phrase, boxes in bbox_predictions.items(): for box in boxes: masks, scores = self.run_efficient_sam(image, None, box, 1) if len(masks) > 0: self.get_logger().info(f"Mask: {masks[0]}") masks_list.append(masks[0]) + item_id = f"food_id_{mask_num:d}" + mask_num += 1 mask_msg = self.generate_mask_msg( - phrase, scores[0], masks[0], image, depth_img, box + item_id, phrase, scores[0], masks[0], image, depth_img, box ) detected_items.append(mask_msg) item_labels.append(phrase) From 648a46e09db9a2a498e2aea827d84029cf5a8d99 Mon Sep 17 00:00:00 2001 From: Sriram Kutty Date: Mon, 30 Sep 2024 14:20:35 -0700 Subject: [PATCH 20/42] masks msgs are generated but action keeps aborting --- .../segment_all_items.py | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py index c791b6cb..3c75a80b 100644 --- a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py +++ b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py @@ -40,6 +40,7 @@ from ada_feeding_msgs.action import SegmentAllItems from ada_feeding_msgs.msg import Mask from ada_feeding_perception.helpers import ( + BoundingBox, bbox_from_mask, crop_image_mask_and_point, cv2_image_to_ros_msg, @@ -751,7 +752,7 @@ def run_grounding_dino( for logit, box in zip(logits_filt, boxes_filt): # Predict phrases based on the bounding boxes and the text threshold phrase = get_phrases_from_posmap(logit > text_threshold, caption_tokens, tokenizer) - self.get_logger().info(f"logit: {logit}, box: {box}") + #self.get_logger().info(f"logit: {logit}, box: {box}") self.get_logger().info(f"{phrase}") if phrase not in boxes_cxcywh: boxes_cxcywh[phrase] = [] @@ -821,6 +822,8 @@ def generate_mask_msg( # getting the connected component of the mask center_x = (bbox[0] + bbox[2]) // 2 center_y = (bbox[1] + bbox[3]) // 2 + center_x = int(center_x) + center_y = int(center_y) seed_point = (center_x, center_y) # Use the mask to get a connected component containing the seed point @@ -843,12 +846,16 @@ def generate_mask_msg( ) return None + # Convert the bounding box from a Python Tuple into the BoundingBox class + # from helpers.py to call the crop_image_mask_and_point + bbox_converted = BoundingBox(int(bbox[0]), int(bbox[1]), int(bbox[2]), 
int(bbox[3])) + # Crop the image and the mask - cropped_image, cropped_mask = crop_image_mask_and_point( - image, cleaned_mask, seed_point, bbox + cropped_image, cropped_mask, _ = crop_image_mask_and_point( + image, cleaned_mask, seed_point, bbox_converted ) cropped_depth, _, _ = crop_image_mask_and_point( - depth_img, cleaned_mask, seed_point, bbox + depth_img, cleaned_mask, seed_point, bbox_converted ) # Convert the mask to an image @@ -857,8 +864,8 @@ def generate_mask_msg( # Create the ROS mask message mask_msg = Mask() mask_msg.roi = RegionOfInterest( - x_offset=bbox[0], - y_offset=bbox[1], + x_offset=int(bbox[0]), + y_offset=int(bbox[1]), height=int(bbox[3] - bbox[1]), width=int(bbox[2] - bbox[0]), do_rectify=False, @@ -869,7 +876,7 @@ def generate_mask_msg( mask_msg.average_depth = median_depth_mm / 1000.0 mask_msg.item_id = item_id mask_msg.object_id = object_id - mask_msg.score = score + mask_msg.confidence = float(score) return mask_msg @@ -1003,7 +1010,7 @@ async def run_vision_pipeline(self, image_msg: Image, caption: str): for box in boxes: masks, scores = self.run_efficient_sam(image, None, box, 1) if len(masks) > 0: - self.get_logger().info(f"Mask: {masks[0]}") + #self.get_logger().info(f"Mask: {masks[0]}") masks_list.append(masks[0]) item_id = f"food_id_{mask_num:d}" mask_num += 1 @@ -1013,7 +1020,7 @@ async def run_vision_pipeline(self, image_msg: Image, caption: str): detected_items.append(mask_msg) item_labels.append(phrase) - self.display_masks(image, masks_list, item_labels) + #self.display_masks(image, masks_list, item_labels) #self.get_logger().info(f"Detected items: {detected_items}") self.get_logger().info(f"Item_labels: {item_labels}") result.detected_items = detected_items From 3032f65296e78a2cc3f21266028977a90c407e48 Mon Sep 17 00:00:00 2001 From: Sriram Kutty Date: Wed, 6 Nov 2024 19:12:28 -0800 Subject: [PATCH 21/42] Added gpt-4o query functionality --- .../segment_all_items.py | 120 ++++++++++++++---- .../config/segment_all_items.yaml | 2 +- requirements.txt | 1 + 3 files changed, 100 insertions(+), 23 deletions(-) diff --git a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py index 3c75a80b..a4ff87d5 100644 --- a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py +++ b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py @@ -35,6 +35,8 @@ from torchvision import transforms from PIL import Image as ImagePIL from copy import deepcopy +import base64 +from openai import OpenAI # Local imports from ada_feeding_msgs.action import SegmentAllItems @@ -180,6 +182,10 @@ def __init__(self): Image, "~/groundingdino_detection", 1 ) + # Initialize the OpenAI API and load environment variables + API_KEY = "" + self.openai = OpenAI(api_key=API_KEY) + def read_params( self, ) -> Tuple[Parameter, Parameter, Parameter, Parameter, Parameter]: @@ -558,29 +564,80 @@ def cancel_callback(self_: ServerGoalHandle) -> CancelResponse: self.get_logger().info("Cancelling the goal request...") return CancelResponse.ACCEPT - def run_gpt4v(self, image: npt.NDArray, system_prompt: str, user_prompt: str): + def run_gpt4o(self, image: npt.NDArray, labels_list: list[str]): """ Run GPT-4V on the image. Parameters ---------- image: The image to segment, in BGR. - system_prompt: The system prompt to use for GPT-4V. - user_prompt: The user prompt to use for GPT-4V. + system_prompt: The system prompt to use for GPT-4o. + user_prompt: The user prompt to use for GPT-4o. 
Returns ------- - The caption generated by GPT-4V. + The caption generated by GPT-4o. """ - self.get_logger().info("Running GPT-4V...") + self.get_logger().info("Running GPT-4o...") - # Convert image from BGR to RGB for GPT-4V - image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + # Encode the image to JPEG format + _, buffer = cv2.imencode(".jpg", image) - # Run GPT-4V on the image using the input prompts - caption = "" - - return caption + # Convert the buffer to the base 64 encoded image for GPT-4o + image_base64 = base64.b64encode(buffer).decode("utf-8") + + # Write the system and user prompts for GPT-4o + system_query = f""" + You are a prompt engineer that is assigned to describe items + in an image you have been queried so that a vision language model + can take in your prompt as a query and use it to for classification + tasks. You respond in string format and do not provide any explanation + for your responses. + """ + user_query = f""" + Your objective is to generate a sentence prompt that describes the food + items on a plate in an image. + You are given an image of a plate with food items and a list of the food items + on the plate. + Please compile the inputs from the list into a sentence prompt that effectively + lists the food items on the plate. + Add qualifiers to the prompt to better visually describe the food for the VLM + to identify. Don't add any irrelevant qualifiers. + + Here is the input list of food items to compile into a string: {labels_list} + + Here are some sample responses that convey how you should format your responses: + + "Food items including grapes, strawberries, blueberries, melon chunks, and + carrots on a small, blue plate." + + "Food items including strips of grilled meat and seasoned cucumber + spears arranged on a light gray plate." + + "Food items including baked chicken pieces, black olives, bell pepper slices, + and artichoke on a plate." 
+ """ + + # Run GPT-4o to generate a sentence caption given the user query, system query, + # and image prompt + response = self.openai.chat.completions.create( + model="gpt-4o-mini", + messages=[{"role": "system", "content": system_query}, + {"role": "user", + "content": [ + {"type": "text", "text": user_query}, + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{image_base64}" + } + } + ]}], + ) + + vlm_query = response.choices[0].message.content + + return vlm_query def run_sam( self, @@ -722,7 +779,7 @@ def run_grounding_dino( # Run Open-GroundingDINO on the image using the input caption image_transformed = image_transformed.to(device=self.device) - self.get_logger().info(f"device: {self.device}") + #self.get_logger().info(f"device: {self.device}") with torch.no_grad(): outputs = self.groundingdino( image_transformed[None], @@ -740,9 +797,9 @@ def run_grounding_dino( logits_filt = logits_filt[filt_thresh_mask] boxes_filt = boxes_filt[filt_thresh_mask] - self.get_logger().info(f"Caption: {caption}") - self.get_logger().info(f"Boxes: {boxes_filt}") - self.get_logger().info(f"Logits: {logits_filt}") + #self.get_logger().info(f"Caption: {caption}") + #self.get_logger().info(f"Boxes: {boxes_filt}") + #self.get_logger().info(f"Logits: {logits_filt}") # Tokenize the caption tokenizer = self.groundingdino.tokenizer @@ -753,7 +810,7 @@ def run_grounding_dino( # Predict phrases based on the bounding boxes and the text threshold phrase = get_phrases_from_posmap(logit > text_threshold, caption_tokens, tokenizer) #self.get_logger().info(f"logit: {logit}, box: {box}") - self.get_logger().info(f"{phrase}") + #self.get_logger().info(f"{phrase}") if phrase not in boxes_cxcywh: boxes_cxcywh[phrase] = [] boxes_cxcywh[phrase].append(box.cpu().numpy()) @@ -761,7 +818,7 @@ def run_grounding_dino( # Define height and width of image height, width, _ = image.shape - self.get_logger().info(f"height, width: {height}, {width}") + #self.get_logger().info(f"height, width: {height}, {width}") # Convert the bounding boxes outputted by GroundingDINO to the following format # [top left x-value, top left y-value, bottom right x-value, bottom right y-value] @@ -780,11 +837,11 @@ def run_grounding_dino( y1 = y0 + h boxes_xyxy[phrase].append([x0, y0, x1, y1]) - self.get_logger().info(f"Predictions: {boxes_xyxy}") + #self.get_logger().info(f"Predictions: {boxes_xyxy}") # Measure the elapsed time running GroundingDINO on the image prompt inference_time = int(round((time.time() - inference_time) * 1000)) - self.get_logger().info(f"Approx. Inference Time: {inference_time}") + #self.get_logger().info(f"Approx. 
Inference Time: {inference_time}") return boxes_xyxy @@ -907,9 +964,9 @@ def visualize_groundingdino_results(self, image: Image, predictions: dict): # Display text label below bounding box image_copy = cv2.putText(image_copy, phrase, (x0, y0 - 12), 0, 1e-3 * height, color, thickness // 3) - #cv2.imshow('image', image_copy) - #cv2.waitKey(0) - #cv2.destroyAllWindows() + cv2.imshow('image', image_copy) + cv2.waitKey(0) + cv2.destroyAllWindows() # Publish the image self.viz_groundingdino_pub.publish( cv2_image_to_ros_msg(image_copy, compress=False, bridge=self.bridge) @@ -946,6 +1003,20 @@ def display_masks(self, image: npt.NDArray, masks: list[npt.NDArray], item_label #cv2.imshow("Masks", image) #cv2.waitKey(0) #cv2.destroyAllWindows() + + def display_mask(self, image: npt.NDArray, mask: npt.NDArray): + """ + """ + image_copy = deepcopy(image) + mask_int = mask.astype(np.uint8) + mask_int = cv2.resize(mask_int, (image.shape[1], image.shape[0])) + color_dims = 3 + mask_int = np.stack([mask_int] * color_dims, axis=-1) + color_scalar = random.randint(80, 255) + mask_int = np.multiply(mask_int, color_scalar) + cv2.imshow('mask', mask_int) + cv2.waitKey(0) + cv2.destroyAllWindows() async def run_vision_pipeline(self, image_msg: Image, caption: str): """ @@ -994,6 +1065,11 @@ async def run_vision_pipeline(self, image_msg: Image, caption: str): # Run Open-GroundingDINO on the image bbox_predictions = self.run_grounding_dino(image, caption, self.box_threshold, self.text_threshold) + # Run GPT-4o on the image and caption to generate a sentence prompt + # for the food items detected in the image + gpt4o_query = self.run_gpt4o(image, caption) + self.get_logger().info(f"GPT-4o Query: {gpt4o_query}") + # Publish a visualization of the GroundingDINO predictions, if the visualization # flag is set to true if self.viz_groundingdino: diff --git a/ada_feeding_perception/config/segment_all_items.yaml b/ada_feeding_perception/config/segment_all_items.yaml index 0dc77823..80da2dff 100644 --- a/ada_feeding_perception/config/segment_all_items.yaml +++ b/ada_feeding_perception/config/segment_all_items.yaml @@ -29,4 +29,4 @@ segment_all_items: # A boolean to determine whether to visualize the bounding box predictions # made by GroundingDINO - viz_groundingdino: true + viz_groundingdino: false diff --git a/requirements.txt b/requirements.txt index f8f6fc95..5bfbed7e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ pyrealsense2 overrides sounddevice scikit-spatial +openai git+https://github.com/facebookresearch/segment-anything.git git+https://github.com/yformer/EfficientSAM.git git+https://github.com/IDEA-Research/GroundingDINO.git From 85f9577f2d826f94e6ed34e2c960ad7ec46f9a2b Mon Sep 17 00:00:00 2001 From: Sriram Kutty Date: Thu, 7 Nov 2024 22:19:36 -0800 Subject: [PATCH 22/42] groundingdino can be downloaded via github url --- .../segment_all_items.py | 29 +++++++++++++++---- .../config/segment_all_items.yaml | 2 ++ 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py index a4ff87d5..d40197e7 100644 --- a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py +++ b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py @@ -92,8 +92,14 @@ def __init__(self): download_checkpoint(seg_model_name, model_dir, seg_model_base_url) self.get_logger().info(f"Model checkpoint downloaded {seg_model_path}.") - # Set the path to GroundingDINO 
and its config file in the model directory + # Download the checkpoint for GroundingDINO if it doesn't exist groundingdino_model_path = os.path.join(model_dir, groundingdino_model_name) + if not os.path.isfile(groundingdino_model_path): + self.get_logger().info("Model checkpoint does not exist. Downloading...") + download_checkpoint(groundingdino_model_name, model_dir) + self.get_logger().info(f"Model checkpoint downloaded {groundingdino_model_path}.") + + # Set the path to the GroundingDINO configurations file in the model directory groundingdino_config_path = os.path.join(model_dir, groundingdino_config_name) # Subscribe to the camera info topic, to get the camera intrinsics @@ -265,7 +271,7 @@ def read_params( ParameterDescriptor( name="groundingdino_config_name", type=ParameterType.PARAMETER_STRING, - description="The name of the configuration file to use for Open-GroundingDINO", + description="The name of the configuration file to use for GroundingDINO", read_only=True, ), ), @@ -275,7 +281,20 @@ def read_params( ParameterDescriptor( name="groundingdino_model_name", type=ParameterType.PARAMETER_STRING, - description="The name of the model checkpoint to use for Open-GroundingDINO", + description="The name of the model checkpoint to use for GroundingDINO", + read_only=True, + ), + ), + ( + "groundingdino_model_base_url", + None, + ParameterDescriptor( + name="groundingdino_model_base_url", + type=ParameterType.PARAMETER_STRING, + description=( + "The URL to download the model checkpoint from if " + "it is not already downloaded for GroundingDINO" + ), read_only=True, ), ), @@ -419,8 +438,6 @@ def initialize_grounding_dino( self.get_logger().info("...Done!") - # Move bottom two functions to helpers file and import them for - # both segment_all_items.py and segment_from_point.py def initialize_sam(self, model_name: str, model_path: str) -> None: """ Initialize all attributes needed for food segmentation with SAM. 
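For reference, a minimal sketch, assuming a plain HTTP fetch with urllib, of what a checkpoint-download helper like the one invoked above typically does. The real download_checkpoint lives in ada_feeding_perception/helpers.py and its exact signature and behavior may differ; this only illustrates the idea of fetching the GroundingDINO weights from groundingdino_model_base_url when they are not on disk.

import os
import urllib.request

def download_checkpoint_sketch(checkpoint_name: str, model_dir: str, base_url: str) -> str:
    os.makedirs(model_dir, exist_ok=True)
    destination = os.path.join(model_dir, checkpoint_name)
    if not os.path.isfile(destination):
        # The yaml above stores a fully-qualified URL for GroundingDINO; fall back to
        # base_url + checkpoint_name if only a directory-style base is configured.
        url = base_url if base_url.endswith(checkpoint_name) else f"{base_url.rstrip('/')}/{checkpoint_name}"
        urllib.request.urlretrieve(url, destination)
    return destination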
@@ -558,7 +575,7 @@ def goal_callback(self, goal_request: SegmentAllItems.Goal) -> GoalResponse: ) return GoalResponse.REJECT - def cancel_callback(self_: ServerGoalHandle) -> CancelResponse: + def cancel_callback(self, _: ServerGoalHandle) -> CancelResponse: """ """ self.get_logger().info("Cancelling the goal request...") diff --git a/ada_feeding_perception/config/segment_all_items.yaml b/ada_feeding_perception/config/segment_all_items.yaml index 80da2dff..0a82d745 100644 --- a/ada_feeding_perception/config/segment_all_items.yaml +++ b/ada_feeding_perception/config/segment_all_items.yaml @@ -15,6 +15,8 @@ segment_all_items: groundingdino_config_name: GroundingDINO_SwinT_OGC.py # The name of the GroundingDINO model checkpoint to use groundingdino_model_name: groundingdino_swint_ogc.pth + # The URL to download the model checkpoint from if it is not already downloaded + groundingdino_model_base_url: "https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth" # Whether to use SAM or EfficientSAM use_efficient_sam: true From 004959856644ef6e2c8684490aa54842df921c18 Mon Sep 17 00:00:00 2001 From: Sriram Kutty Date: Fri, 8 Nov 2024 09:54:21 -0800 Subject: [PATCH 23/42] updated comments/code quality changes --- .../segment_all_items.py | 67 +++++++++++++------ 1 file changed, 48 insertions(+), 19 deletions(-) diff --git a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py index d40197e7..a765ca14 100644 --- a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py +++ b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py @@ -546,6 +546,12 @@ def image_callback(self, msg: Union[Image, CompressedImage]) -> None: def goal_callback(self, goal_request: SegmentAllItems.Goal) -> GoalResponse: """ + Accept or reject the goal request based on the availability of the latest + RGB and depth images. + + Parameters + ---------- + goal_request: The goal request. """ # If no RGB image is received, reject the goal request with self.latest_img_msg_lock: @@ -577,23 +583,31 @@ def goal_callback(self, goal_request: SegmentAllItems.Goal) -> GoalResponse: def cancel_callback(self, _: ServerGoalHandle) -> CancelResponse: """ + Always accept the cancel request, however, 'execute_callback' + will wait for segmentation to complete and not interrupt the process + in response to a cancel request. + + Parameters + ---------- + goal_handle: The goal handle. """ self.get_logger().info("Cancelling the goal request...") return CancelResponse.ACCEPT def run_gpt4o(self, image: npt.NDArray, labels_list: list[str]): """ - Run GPT-4V on the image. + Run GPT-4o on the image. Parameters ---------- - image: The image to segment, in BGR. - system_prompt: The system prompt to use for GPT-4o. - user_prompt: The user prompt to use for GPT-4o. + image: A CV2 above the plate image that is used as reference for GPT-4o + to generate a caption input for GroundingDINO. + labels_list: The list of food items to compile into a sentence prompt. Returns ------- - The caption generated by GPT-4o. + vlm_query: The caption generated by GPT-4o that is used as text input for + GroundingDINO. """ self.get_logger().info("Running GPT-4o...") @@ -668,7 +682,7 @@ def run_sam( Parameters ---------- - image: The image to segment, in BGR. + image: The image to perform segmentation on. seed_point: The seed point for SAM to segment from. bbox: The bounding box prompt for SAM to segment. 
prompt: The prompt to use for SAM. If 0, use the seed point prompt. @@ -676,7 +690,8 @@ def run_sam( Returns ------- - A list of tuples containing the confidence and mask for each segmentation. + masks: The masks for each segmentation. + scores: The confidence scores for each segmentation. """ self.get_logger().info("Segmenting image with SAM...") @@ -711,7 +726,7 @@ def run_efficient_sam( Parameters ---------- - image: The image to segment, in BGR. + image: The image to perform segmentation on. seed_point: The seed point for EfficientSAM to segment from. bbox: The bounding box prompt for EfficientSAM to segment. prompt: The prompt to use for EfficientSAM. If 0, use the seed point prompt. @@ -772,8 +787,9 @@ def run_grounding_dino( Parameters ---------- - image: The CV2 image to retrieve semantically labeled bounding boxes from. - caption: The caption to use for Open-GroundingDINO. + image: The CV2 above the plate image to retrieve semantically labeled bounding + boxes from. + caption: The caption to use as text input for GroundingDINO. box_threshold: The threshold for the bounding box. text_threshold: The threshold for the text. @@ -863,6 +879,18 @@ def run_grounding_dino( return boxes_xyxy def load_image(self, image_array: npt.NDArray): + """ + Load the image and apply transformations to it. + + Parameters + ---------- + image_array: The image to load. + + Returns + ------- + image_pil: The image in Image pillow format. + image: The image in tensor format. + """ # Convert image from BGR to RGB to convert CV2 image to image pillow image_array = cv2.cvtColor(image_array, cv2.COLOR_BGR2RGB) @@ -1037,12 +1065,12 @@ def display_mask(self, image: npt.NDArray, mask: npt.NDArray): async def run_vision_pipeline(self, image_msg: Image, caption: str): """ - Run the vision pipeline consisting of GroundingDINO and EfficientSAM on the image. - The caption and latest image are prompted to GroundingDINO which outputs bounding boxes - for each food item label in the caption detected in the image. The detected items are then - segmented by passing in the bounding box detections into EfficientSAM which outputs - pixel-wise masks for each bounding box. The top masks are then returned along with the - food item label for each mask as a dictionary. + Run the vision pipeline consisting of two foundation models, GroundingDINO and + EfficientSAM, on the image. GroundingDINO is prompted with a caption and the latest image, + and outputs bounding boxes for each semantic label in the caption detected in the + image. The detected items are then segmented by passing in the bounding box detections + into EfficientSAM which outputs pixel-wise masks for each bounding box. The top masks + are then returned along with the semantic label for each mask as a dictionary. Parameters ---------- @@ -1052,7 +1080,8 @@ async def run_vision_pipeline(self, image_msg: Image, caption: str): Returns ------- - + result: The result message containing masks for all food items detected in the image + paired with semantic labels. """ self.get_logger().info("Running the vision pipeline...") @@ -1137,8 +1166,8 @@ async def execute_callback( Returns ------- - The result of the action server containing masks for all food items - detected in the image. + result: The result message containing masks for all food items detected in the image + paired with semantic labels. 
""" starting_time = self.get_clock().now() self.get_logger().info("Received a new goal!") From e9fd4d5e80c1030989d7991b60588ee2cc8a8ffe Mon Sep 17 00:00:00 2001 From: Sriram Kutty Date: Fri, 8 Nov 2024 15:34:43 -0800 Subject: [PATCH 24/42] invoking gpt-4o has been transformed into a service --- ada_feeding_msgs/CMakeLists.txt | 1 + ada_feeding_msgs/srv/GenerateCaption.srv | 11 +++ .../segment_all_items.py | 85 ++++++++++++++----- 3 files changed, 78 insertions(+), 19 deletions(-) create mode 100644 ada_feeding_msgs/srv/GenerateCaption.srv diff --git a/ada_feeding_msgs/CMakeLists.txt b/ada_feeding_msgs/CMakeLists.txt index 6ad2b3f0..d4effcf8 100644 --- a/ada_feeding_msgs/CMakeLists.txt +++ b/ada_feeding_msgs/CMakeLists.txt @@ -36,6 +36,7 @@ rosidl_generate_interfaces(${PROJECT_NAME} "srv/AcquisitionSelect.srv" "srv/GetRobotState.srv" "srv/ToggleFaceDetection.srv" + "srv/GenerateCaption.srv" DEPENDENCIES geometry_msgs sensor_msgs std_msgs ) diff --git a/ada_feeding_msgs/srv/GenerateCaption.srv b/ada_feeding_msgs/srv/GenerateCaption.srv new file mode 100644 index 00000000..8a1d150a --- /dev/null +++ b/ada_feeding_msgs/srv/GenerateCaption.srv @@ -0,0 +1,11 @@ +# The interface for a service that takes in a list of input labels +# describing the food items on a plate and returns a sentence caption compiling +# these labels used as a query for GroundingDINO detection. + +# A list of semantic labels corresponding to each of the masks of detected +# items in the image +string[] input_labels +--- +# A sentence caption compiling the semantic labels used as a query for +# GroundingDINO to perform bounding box detections. +string caption \ No newline at end of file diff --git a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py index a765ca14..86254257 100644 --- a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py +++ b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py @@ -1,7 +1,8 @@ """ This file defines the SegmentAllItems class, which launches an action server that -segments all food items in the latest image and defines each segmentation with a semantic -label using GPT-4V, GroundingDINO, and Segment Anything. +takes a list of labels describing food items on a plate and returns segmentation masks +of all food items in the latest image and defines each segmentation with a semantic label +using a pipeline of foundation models including GPT-4o, GroundingDINO, and SegmentAnything. """ # Standard imports @@ -41,6 +42,7 @@ # Local imports from ada_feeding_msgs.action import SegmentAllItems from ada_feeding_msgs.msg import Mask +from ada_feeding_msgs.srv import GenerateCaption from ada_feeding_perception.helpers import ( BoundingBox, bbox_from_mask, @@ -56,7 +58,7 @@ class SegmentAllItemsNode(Node): """ The SegmentAllItemsNode launches an action server that segments all food items in the latest image and defines each segmentation with a semantic - label using GPT-4V, GroundingDINO, and Segment Anything. + label using GPT-4o, GroundingDINO, and SegmentAnything. 
""" def __init__(self): @@ -75,6 +77,7 @@ def __init__(self): seg_model_base_url, groundingdino_config_name, groundingdino_model_name, + groundingdino_model_base_url, model_dir, self.use_efficient_sam, self.rate_hz, @@ -96,7 +99,7 @@ def __init__(self): groundingdino_model_path = os.path.join(model_dir, groundingdino_model_name) if not os.path.isfile(groundingdino_model_path): self.get_logger().info("Model checkpoint does not exist. Downloading...") - download_checkpoint(groundingdino_model_name, model_dir) + download_checkpoint(groundingdino_model_name, model_dir, groundingdino_model_base_url) self.get_logger().info(f"Model checkpoint downloaded {groundingdino_model_path}.") # Set the path to the GroundingDINO configurations file in the model directory @@ -169,6 +172,14 @@ def __init__(self): self.active_goal_request_lock = threading.Lock() self.active_goal_request = None + # Create the service that invokes GPT-4o + self.srv = self.create_service( + GenerateCaption, + "~/invoke_gpt4o", + self.invoke_gpt4o_callback, + callback_group=MutuallyExclusiveCallbackGroup(), + ) + # Create the Action Server. # Note: remapping action names does not work: https://github.com/ros2/ros2/issues/1312 self._action_server = ActionServer( @@ -208,6 +219,7 @@ def read_params( efficient_sam_model_base_url, groundingdino_config_name, groundingdino_model_name, + groundingdino_model_base_url, model_dir, use_efficient_sam, rate_hz, @@ -399,6 +411,7 @@ def read_params( seg_model_base_url, groundingdino_config_name.value, groundingdino_model_name.value, + groundingdino_model_base_url.value, model_dir.value, use_efficient_sam.value, rate_hz.value, @@ -593,6 +606,45 @@ def cancel_callback(self, _: ServerGoalHandle) -> CancelResponse: """ self.get_logger().info("Cancelling the goal request...") return CancelResponse.ACCEPT + + def invoke_gpt4o_callback(self, request: GenerateCaption.Request, response: GenerateCaption.Response): + """ + Callback function for the GPT-4o service. This function takes in a list + of string labels describing the foods on an image as input and returns a + caption generated by GPT-4o that compiles these labels into a descriptive + sentence used as a query for GroundingDINO. + + Parameters + ---------- + request: The given request message. + response: The created response message. + + Returns + ---------- + response: The updated response message based on the request. + """ + # Get the latest image and camera info + with self.latest_img_msg_lock: + latest_img_msg = self.latest_img_msg + with self.camera_info_lock: + camera_info = self.camera_info + + # Check if the image and camera info are available + if latest_img_msg is None or camera_info is None: + self.get_logger().error("Image or camera info not available.") + return response + + # Convert the image message to a CV2 image + image = ros_msg_to_cv2_image(latest_img_msg) + + # Run GPT-4o to generate a caption for the image + vlm_query = self.run_gpt4o(image, request.input_labels) + self.get_logger().info(f"GPT-4o Query: {vlm_query}") + + # Set the response message to the caption generated by GPT-4o + response.caption = vlm_query + + return response def run_gpt4o(self, image: npt.NDArray, labels_list: list[str]): """ @@ -639,14 +691,14 @@ def run_gpt4o(self, image: npt.NDArray, labels_list: list[str]): Here are some sample responses that convey how you should format your responses: - "Food items including grapes, strawberries, blueberries, melon chunks, and - carrots on a small, blue plate." 
+ Food items including grapes, strawberries, blueberries, melon chunks, and + carrots on a small, blue plate. - "Food items including strips of grilled meat and seasoned cucumber - spears arranged on a light gray plate." + Food items including strips of grilled meat and seasoned cucumber + spears arranged on a light gray plate. - "Food items including baked chicken pieces, black olives, bell pepper slices, - and artichoke on a plate." + Food items including baked chicken pieces, black olives, bell pepper slices, + and artichoke on a plate. """ # Run GPT-4o to generate a sentence caption given the user query, system query, @@ -666,6 +718,7 @@ def run_gpt4o(self, image: npt.NDArray, labels_list: list[str]): ]}], ) + # Get the caption generated by GPT-4o vlm_query = response.choices[0].message.content return vlm_query @@ -1111,11 +1164,6 @@ async def run_vision_pipeline(self, image_msg: Image, caption: str): # Run Open-GroundingDINO on the image bbox_predictions = self.run_grounding_dino(image, caption, self.box_threshold, self.text_threshold) - # Run GPT-4o on the image and caption to generate a sentence prompt - # for the food items detected in the image - gpt4o_query = self.run_gpt4o(image, caption) - self.get_logger().info(f"GPT-4o Query: {gpt4o_query}") - # Publish a visualization of the GroundingDINO predictions, if the visualization # flag is set to true if self.viz_groundingdino: @@ -1200,16 +1248,15 @@ async def execute_callback( feedback = SegmentAllItems.Feedback() while ( rclpy.ok() - and not goal_handle.is_cancel_requested() + and not goal_handle.is_cancel_requested and not vision_pipeline_task.done() ): - feedback.elapsed_time = ((self.get_clock().now() - starting_time).nanoseconds / - 1e9).to_msg() + feedback.elapsed_time = (self.get_clock().now() - starting_time).to_msg() goal_handle.publish_feedback(feedback) rate.sleep() # If there is a cancel request, cancel the vision pipeline task - if goal_handle.is_cancel_requested(): + if goal_handle.is_cancel_requested: self.get_logger().info("Goal cancelled.") goal_handle.canceled() result = SegmentAllItems.Result() From 9d52d98ec78aa09d5bb3445df04331250c192a4d Mon Sep 17 00:00:00 2001 From: Sriram Kutty Date: Fri, 8 Nov 2024 15:46:07 -0800 Subject: [PATCH 25/42] segment all items action now takes a single string as input --- ada_feeding_msgs/action/SegmentAllItems.action | 2 +- .../ada_feeding_perception/segment_all_items.py | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/ada_feeding_msgs/action/SegmentAllItems.action b/ada_feeding_msgs/action/SegmentAllItems.action index f91d8c1b..682a07f3 100644 --- a/ada_feeding_msgs/action/SegmentAllItems.action +++ b/ada_feeding_msgs/action/SegmentAllItems.action @@ -2,7 +2,7 @@ # the masks of all segmented items within that image. # The list of input semantic labels for the food items on the plate -string[] input_labels +string caption --- # Possible return statuses uint8 STATUS_SUCCEEDED=0 diff --git a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py index 86254257..da2c07c5 100644 --- a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py +++ b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py @@ -1231,10 +1231,8 @@ async def execute_callback( self.get_logger().error("Image or camera info not available.") return SegmentAllItems.Result() - # Convert the input label list from goal request to a single string caption - # for GroundingDINO - caption = ' . 
'.join(goal_handle.request.input_labels).lower().strip() - caption += '.' + # Get the caption from the goal request + caption = goal_handle.request.caption self.get_logger().info(f"caption: {caption}") # Start running the vision pipeline as a separate thread From 30bc036537bde12cb3661b74477d8a4d93911768 Mon Sep 17 00:00:00 2001 From: Sriram Kutty Date: Fri, 8 Nov 2024 16:11:31 -0800 Subject: [PATCH 26/42] added env variables --- .gitignore | 3 +++ .../ada_feeding_perception/segment_all_items.py | 7 ++++++- requirements.txt | 1 + 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 7c67213b..6058eb87 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,9 @@ build/ __pycache__/ +# Environment Variables file +.env + # Compiled Object files *.slo *.lo diff --git a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py index da2c07c5..d9b6b0b2 100644 --- a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py +++ b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py @@ -38,6 +38,7 @@ from copy import deepcopy import base64 from openai import OpenAI +from dotenv import load_dotenv # Local imports from ada_feeding_msgs.action import SegmentAllItems @@ -71,6 +72,9 @@ def __init__(self): # Check if cuda is available self.device = "cuda" if torch.cuda.is_available() else "cpu" + # Load environment variables from the .env file + load_dotenv() + # Load the parameters' ( seg_model_name, @@ -200,7 +204,8 @@ def __init__(self): ) # Initialize the OpenAI API and load environment variables - API_KEY = "" + API_KEY = os.getenv("OPENAI_API_KEY") + self.get_logger().info(f"API Key: {API_KEY}") self.openai = OpenAI(api_key=API_KEY) def read_params( diff --git a/requirements.txt b/requirements.txt index 5bfbed7e..883f008f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,7 @@ overrides sounddevice scikit-spatial openai +python-dotenv git+https://github.com/facebookresearch/segment-anything.git git+https://github.com/yformer/EfficientSAM.git git+https://github.com/IDEA-Research/GroundingDINO.git From 4d3b27c93930fa3c42897a5912666ed0a425b4ca Mon Sep 17 00:00:00 2001 From: Sriram Kutty Date: Fri, 8 Nov 2024 16:24:04 -0800 Subject: [PATCH 27/42] environment variables not loading? 
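Note on the subject line above: one likely cause is that python-dotenv's load_dotenv(), called with no
arguments, searches for a .env file starting from the calling module's directory and falls back to the
working directory; for an installed ROS 2 node neither of those is usually the workspace checkout where
the .env actually sits. A minimal sketch of an explicit-path workaround is below; the path shown is a
hypothetical placeholder to adapt, not part of this patch, while OPENAI_API_KEY is the variable name this
series already reads.

    import os
    from dotenv import load_dotenv

    # Hypothetical location of the .env file; adjust to wherever it actually lives.
    dotenv_path = os.path.join(os.path.expanduser("~"), "feeding_ws", ".env")
    load_dotenv(dotenv_path=dotenv_path)

    api_key = os.getenv("OPENAI_API_KEY")
    if api_key is None:
        # Fail fast with a clear message instead of passing None to the OpenAI client.
        raise RuntimeError("OPENAI_API_KEY not set; check the .env path or export it in the shell.")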
--- .../segment_all_items.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py index d9b6b0b2..fe0e0102 100644 --- a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py +++ b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py @@ -205,7 +205,6 @@ def __init__(self): # Initialize the OpenAI API and load environment variables API_KEY = os.getenv("OPENAI_API_KEY") - self.get_logger().info(f"API Key: {API_KEY}") self.openai = OpenAI(api_key=API_KEY) def read_params( @@ -355,7 +354,7 @@ def read_params( name="box_threshold", type=ParameterType.PARAMETER_DOUBLE, description="The lower threshold for the bounding box detections" + - "by Open-GroundingDINO.", + "by GroundingDINO.", read_only=True, ), ), @@ -366,7 +365,7 @@ def read_params( name="text_threshold", type=ParameterType.PARAMETER_DOUBLE, description="The lower threshold for the text detections" + - "by Open-GroundingDINO.", + "by GroundingDINO.", read_only=True, ), ), @@ -431,14 +430,14 @@ def initialize_grounding_dino( self, groundingdino_config_path: str, groundingdino_model_path: str ) -> None: """ - Initialize the Open-GroundingDINO model. + Initialize the GroundingDINO model. Parameters ---------- - groundingdino_config_path: The path to the Open-GroundingDINO configuration file. - groundingdino_model_path: The path to the Open-GroundingDINO model checkpoint. + groundingdino_config_path: The path to the GroundingDINO configuration file. + groundingdino_model_path: The path to the GroundingDINO model checkpoint. """ - self.get_logger().info("Initializing Open-GroundingDINO...") + self.get_logger().info("Initializing GroundingDINO...") # Get model configuration arguments from the configuration file config_args = SLConfig.fromfile(groundingdino_config_path) @@ -841,7 +840,7 @@ def run_grounding_dino( text_threshold: float, ): """ - Run Open-GroundingDINO on the image. + Run GroundingDINO on the image. 
Parameters ---------- @@ -868,7 +867,7 @@ def run_grounding_dino( # Lowercase and strip the caption caption = caption.lower().strip() - # Run Open-GroundingDINO on the image using the input caption + # Run GroundingDINO on the image using the input caption image_transformed = image_transformed.to(device=self.device) #self.get_logger().info(f"device: {self.device}") with torch.no_grad(): @@ -1166,7 +1165,7 @@ async def run_vision_pipeline(self, image_msg: Image, caption: str): # Convert the image to OpenCV format image = ros_msg_to_cv2_image(image_msg, self.bridge) - # Run Open-GroundingDINO on the image + # Run GroundingDINO on the image bbox_predictions = self.run_grounding_dino(image, caption, self.box_threshold, self.text_threshold) # Publish a visualization of the GroundingDINO predictions, if the visualization From 94af48e3e0f0a9e8a87872da3ecba61bc90e60a7 Mon Sep 17 00:00:00 2001 From: Sriram Kutty Date: Fri, 8 Nov 2024 16:35:26 -0800 Subject: [PATCH 28/42] ran black formatter --- .../segment_all_items.py | 319 ++++++++++-------- 1 file changed, 176 insertions(+), 143 deletions(-) diff --git a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py index fe0e0102..f2508e29 100644 --- a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py +++ b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py @@ -12,7 +12,7 @@ # Third-party imports import cv2 -import time +import time import random from cv_bridge import CvBridge from efficient_sam.efficient_sam import build_efficient_sam @@ -55,9 +55,10 @@ ros_msg_to_cv2_image, ) + class SegmentAllItemsNode(Node): """ - The SegmentAllItemsNode launches an action server that segments all food + The SegmentAllItemsNode launches an action server that segments all food items in the latest image and defines each segmentation with a semantic label using GPT-4o, GroundingDINO, and SegmentAnything. """ @@ -103,10 +104,14 @@ def __init__(self): groundingdino_model_path = os.path.join(model_dir, groundingdino_model_name) if not os.path.isfile(groundingdino_model_path): self.get_logger().info("Model checkpoint does not exist. Downloading...") - download_checkpoint(groundingdino_model_name, model_dir, groundingdino_model_base_url) - self.get_logger().info(f"Model checkpoint downloaded {groundingdino_model_path}.") + download_checkpoint( + groundingdino_model_name, model_dir, groundingdino_model_base_url + ) + self.get_logger().info( + f"Model checkpoint downloaded {groundingdino_model_path}." 
+ ) - # Set the path to the GroundingDINO configurations file in the model directory + # Set the path to the GroundingDINO configurations file in the model directory groundingdino_config_path = os.path.join(model_dir, groundingdino_config_name) # Subscribe to the camera info topic, to get the camera intrinsics @@ -159,8 +164,10 @@ def __init__(self): callback_group=MutuallyExclusiveCallbackGroup(), ) - # Initialize GroundingDINO - self.initialize_grounding_dino(groundingdino_config_path, groundingdino_model_path) + # Initialize GroundingDINO + self.initialize_grounding_dino( + groundingdino_config_path, groundingdino_model_path + ) # Initialize Segment Anything if self.use_efficient_sam: @@ -353,8 +360,8 @@ def read_params( ParameterDescriptor( name="box_threshold", type=ParameterType.PARAMETER_DOUBLE, - description="The lower threshold for the bounding box detections" + - "by GroundingDINO.", + description="The lower threshold for the bounding box detections" + + "by GroundingDINO.", read_only=True, ), ), @@ -364,8 +371,8 @@ def read_params( ParameterDescriptor( name="text_threshold", type=ParameterType.PARAMETER_DOUBLE, - description="The lower threshold for the text detections" + - "by GroundingDINO.", + description="The lower threshold for the text detections" + + "by GroundingDINO.", read_only=True, ), ), @@ -395,11 +402,11 @@ def read_params( ParameterDescriptor( name="viz_groundingdino", type=ParameterType.PARAMETER_BOOL, - description="Whether to visualize the bounding box" - + "predictions of GroundingDINO.", + description="Whether to visualize the bounding box" + + "predictions of GroundingDINO.", read_only=True, ), - ) + ), ], ) @@ -425,7 +432,7 @@ def read_params( max_depth_mm.value, viz_groundingdino.value, ) - + def initialize_grounding_dino( self, groundingdino_config_path: str, groundingdino_model_path: str ) -> None: @@ -527,7 +534,7 @@ def initialize_efficient_sam(self, model_name: str, model_path: str) -> None: self.efficient_sam.to(device=self.device) self.get_logger().info("...Done!") - + def camera_info_callback(self, msg: CameraInfo) -> None: """ Store the latest camera info message. @@ -560,7 +567,7 @@ def image_callback(self, msg: Union[Image, CompressedImage]) -> None: """ with self.latest_img_msg_lock: self.latest_img_msg = msg - + def goal_callback(self, goal_request: SegmentAllItems.Goal) -> GoalResponse: """ Accept or reject the goal request based on the availability of the latest @@ -585,24 +592,24 @@ def goal_callback(self, goal_request: SegmentAllItems.Goal) -> GoalResponse: "Rejecting goal request because no depth image was received" ) return GoalResponse.REJECT - + # Accept the goal request is there isn't already an active one, # otherwise reject it with self.active_goal_request_lock: if self.active_goal_request is None: self.get_logger().info("Accepting goal request") - self.active_goal_request = goal_request + self.active_goal_request = goal_request return GoalResponse.ACCEPT self.get_logger().info( "Rejecting goal request because there is already an active one" ) return GoalResponse.REJECT - + def cancel_callback(self, _: ServerGoalHandle) -> CancelResponse: """ Always accept the cancel request, however, 'execute_callback' will wait for segmentation to complete and not interrupt the process - in response to a cancel request. + in response to a cancel request. 
Parameters ---------- @@ -610,12 +617,14 @@ def cancel_callback(self, _: ServerGoalHandle) -> CancelResponse: """ self.get_logger().info("Cancelling the goal request...") return CancelResponse.ACCEPT - - def invoke_gpt4o_callback(self, request: GenerateCaption.Request, response: GenerateCaption.Response): + + def invoke_gpt4o_callback( + self, request: GenerateCaption.Request, response: GenerateCaption.Response + ): """ - Callback function for the GPT-4o service. This function takes in a list - of string labels describing the foods on an image as input and returns a - caption generated by GPT-4o that compiles these labels into a descriptive + Callback function for the GPT-4o service. This function takes in a list + of string labels describing the foods on an image as input and returns a + caption generated by GPT-4o that compiles these labels into a descriptive sentence used as a query for GroundingDINO. Parameters @@ -637,10 +646,10 @@ def invoke_gpt4o_callback(self, request: GenerateCaption.Request, response: Gene if latest_img_msg is None or camera_info is None: self.get_logger().error("Image or camera info not available.") return response - + # Convert the image message to a CV2 image image = ros_msg_to_cv2_image(latest_img_msg) - + # Run GPT-4o to generate a caption for the image vlm_query = self.run_gpt4o(image, request.input_labels) self.get_logger().info(f"GPT-4o Query: {vlm_query}") @@ -656,13 +665,13 @@ def run_gpt4o(self, image: npt.NDArray, labels_list: list[str]): Parameters ---------- - image: A CV2 above the plate image that is used as reference for GPT-4o + image: A CV2 above the plate image that is used as reference for GPT-4o to generate a caption input for GroundingDINO. labels_list: The list of food items to compile into a sentence prompt. Returns ------- - vlm_query: The caption generated by GPT-4o that is used as text input for + vlm_query: The caption generated by GPT-4o that is used as text input for GroundingDINO. """ self.get_logger().info("Running GPT-4o...") @@ -705,21 +714,25 @@ def run_gpt4o(self, image: npt.NDArray, labels_list: list[str]): and artichoke on a plate. """ - # Run GPT-4o to generate a sentence caption given the user query, system query, + # Run GPT-4o to generate a sentence caption given the user query, system query, # and image prompt response = self.openai.chat.completions.create( model="gpt-4o-mini", - messages=[{"role": "system", "content": system_query}, - {"role": "user", - "content": [ - {"type": "text", "text": user_query}, - { - "type": "image_url", - "image_url": { - "url": f"data:image/jpeg;base64,{image_base64}" - } - } - ]}], + messages=[ + {"role": "system", "content": system_query}, + { + "role": "user", + "content": [ + {"type": "text", "text": user_query}, + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{image_base64}" + }, + }, + ], + }, + ], ) # Get the caption generated by GPT-4o @@ -728,10 +741,10 @@ def run_gpt4o(self, image: npt.NDArray, labels_list: list[str]): return vlm_query def run_sam( - self, - image: npt.NDArray, - seed_point: Tuple[int, int], - bbox: Tuple[int, int, int, int], + self, + image: npt.NDArray, + seed_point: Tuple[int, int], + bbox: Tuple[int, int, int, int], prompt: int, ): """ @@ -741,8 +754,8 @@ def run_sam( ---------- image: The image to perform segmentation on. seed_point: The seed point for SAM to segment from. - bbox: The bounding box prompt for SAM to segment. - prompt: The prompt to use for SAM. If 0, use the seed point prompt. 
+ bbox: The bounding box prompt for SAM to segment. + prompt: The prompt to use for SAM. If 0, use the seed point prompt. If 1, use the bounding box prompt. Returns @@ -768,14 +781,14 @@ def run_sam( box=bbox, multimask_output=True, ) - + return masks, scores - + def run_efficient_sam( - self, - image: npt.NDArray, - seed_point: Tuple[int, int], - bbox: Tuple[int, int, int, int], + self, + image: npt.NDArray, + seed_point: Tuple[int, int], + bbox: Tuple[int, int, int, int], prompt: int, ) -> Tuple[npt.NDArray, npt.NDArray]: """ @@ -785,8 +798,8 @@ def run_efficient_sam( ---------- image: The image to perform segmentation on. seed_point: The seed point for EfficientSAM to segment from. - bbox: The bounding box prompt for EfficientSAM to segment. - prompt: The prompt to use for EfficientSAM. If 0, use the seed point prompt. + bbox: The bounding box prompt for EfficientSAM to segment. + prompt: The prompt to use for EfficientSAM. If 0, use the seed point prompt. If 1, use the bounding box prompt. Returns @@ -807,14 +820,18 @@ def run_efficient_sam( prompt_tensor = torch.tensor(np.array(seed_point).reshape((1, 1, 1, 2))).to( device=self.device ) - + # Define the labels for the input prompt prompt_labels = torch.tensor([[[1]]]).to(device=self.device) else: - prompt_tensor = torch.reshape(torch.tensor(bbox), [1, 1, 2, 2]).to(self.device) + prompt_tensor = torch.reshape(torch.tensor(bbox), [1, 1, 2, 2]).to( + self.device + ) # Define the labels for the input prompt - prompt_labels = torch.reshape(torch.tensor([2, 3]), [1, 1, 2]).to(self.device) + prompt_labels = torch.reshape(torch.tensor([2, 3]), [1, 1, 2]).to( + self.device + ) # Run EfficientSAM on the image using the input prompt predicted_logits, predicted_iou = self.efficient_sam( @@ -829,22 +846,22 @@ def run_efficient_sam( ) masks = torch.ge(predicted_logits[0, 0, :, :, :], 0).cpu().detach().numpy() scores = predicted_iou[0, 0, :].cpu().detach().numpy() - + return masks, scores - + def run_grounding_dino( - self, - image: npt.NDArray, - caption: str, - box_threshold: float, - text_threshold: float, + self, + image: npt.NDArray, + caption: str, + box_threshold: float, + text_threshold: float, ): """ Run GroundingDINO on the image. Parameters ---------- - image: The CV2 above the plate image to retrieve semantically labeled bounding + image: The CV2 above the plate image to retrieve semantically labeled bounding boxes from. caption: The caption to use as text input for GroundingDINO. box_threshold: The threshold for the bounding box. @@ -852,12 +869,12 @@ def run_grounding_dino( Returns ------- - bbox_predictions: A dictionary containing the bounding boxes for each food item label + bbox_predictions: A dictionary containing the bounding boxes for each food item label detected from the image. """ self.get_logger().info("Running GroundingDINO...") - - # Set the initial time to measure the elapsed time running GroundingDINO on the + + # Set the initial time to measure the elapsed time running GroundingDINO on the # desired image and text prompts. 
inference_time = time.time() @@ -869,7 +886,7 @@ def run_grounding_dino( # Run GroundingDINO on the image using the input caption image_transformed = image_transformed.to(device=self.device) - #self.get_logger().info(f"device: {self.device}") + # self.get_logger().info(f"device: {self.device}") with torch.no_grad(): outputs = self.groundingdino( image_transformed[None], @@ -878,7 +895,7 @@ def run_grounding_dino( logits = outputs["pred_logits"].sigmoid()[0] boxes = outputs["pred_boxes"][0] self.get_logger().info("... Done") - + # Filter the output based on the box and text thresholds boxes_cxcywh = {} logits_filt = logits.cpu().clone() @@ -887,9 +904,9 @@ def run_grounding_dino( logits_filt = logits_filt[filt_thresh_mask] boxes_filt = boxes_filt[filt_thresh_mask] - #self.get_logger().info(f"Caption: {caption}") - #self.get_logger().info(f"Boxes: {boxes_filt}") - #self.get_logger().info(f"Logits: {logits_filt}") + # self.get_logger().info(f"Caption: {caption}") + # self.get_logger().info(f"Boxes: {boxes_filt}") + # self.get_logger().info(f"Logits: {logits_filt}") # Tokenize the caption tokenizer = self.groundingdino.tokenizer @@ -898,21 +915,22 @@ def run_grounding_dino( # Build the dictionary of bounding boxes for each food item label detected for logit, box in zip(logits_filt, boxes_filt): # Predict phrases based on the bounding boxes and the text threshold - phrase = get_phrases_from_posmap(logit > text_threshold, caption_tokens, tokenizer) - #self.get_logger().info(f"logit: {logit}, box: {box}") - #self.get_logger().info(f"{phrase}") + phrase = get_phrases_from_posmap( + logit > text_threshold, caption_tokens, tokenizer + ) + # self.get_logger().info(f"logit: {logit}, box: {box}") + # self.get_logger().info(f"{phrase}") if phrase not in boxes_cxcywh: boxes_cxcywh[phrase] = [] boxes_cxcywh[phrase].append(box.cpu().numpy()) - # Define height and width of image height, width, _ = image.shape - #self.get_logger().info(f"height, width: {height}, {width}") + # self.get_logger().info(f"height, width: {height}, {width}") # Convert the bounding boxes outputted by GroundingDINO to the following format # [top left x-value, top left y-value, bottom right x-value, bottom right y-value] - # and unnormalize the bounding box coordinate values + # and unnormalize the bounding box coordinate values boxes_xyxy = {} for phrase, boxes in boxes_cxcywh.items(): boxes_xyxy[phrase] = [] @@ -926,12 +944,12 @@ def run_grounding_dino( x1 = x0 + w y1 = y0 + h boxes_xyxy[phrase].append([x0, y0, x1, y1]) - - #self.get_logger().info(f"Predictions: {boxes_xyxy}") - + + # self.get_logger().info(f"Predictions: {boxes_xyxy}") + # Measure the elapsed time running GroundingDINO on the image prompt inference_time = int(round((time.time() - inference_time) * 1000)) - #self.get_logger().info(f"Approx. Inference Time: {inference_time}") + # self.get_logger().info(f"Approx. 
Inference Time: {inference_time}") return boxes_xyxy @@ -953,7 +971,7 @@ def load_image(self, image_array: npt.NDArray): # Convert image to image pillow to apply transformation image_pil = ImagePIL.fromarray(image_array, mode="RGB") - #image_pil.show() + # image_pil.show() transform = T.Compose( [ T.RandomResize([800], max_size=1333), @@ -962,21 +980,20 @@ def load_image(self, image_array: npt.NDArray): ] ) image, _ = transform(image_pil, None) # 3, h, w - + return image_pil, image - + def generate_mask_msg( - self, - item_id: str, + self, + item_id: str, object_id: str, - score: float, - mask:npt.NDArray[np.bool_], - image: npt.NDArray, - depth_img: npt.NDArray, + score: float, + mask: npt.NDArray[np.bool_], + image: npt.NDArray, + depth_img: npt.NDArray, bbox: Tuple[int, int, int, int], ) -> Optional[Mask]: - """ - """ + """ """ # Calculate center of the bounding box and use it as the seed point for # getting the connected component of the mask center_x = (bbox[0] + bbox[2]) // 2 @@ -985,9 +1002,9 @@ def generate_mask_msg( center_y = int(center_y) seed_point = (center_x, center_y) - # Use the mask to get a connected component containing the seed point + # Use the mask to get a connected component containing the seed point cleaned_mask = get_connected_component(mask, seed_point) - + # Use the cleaned mask to calculate the median depth over the mask masked_depth = depth_img[cleaned_mask] median_depth_mm = np.median( @@ -1006,8 +1023,10 @@ def generate_mask_msg( return None # Convert the bounding box from a Python Tuple into the BoundingBox class - # from helpers.py to call the crop_image_mask_and_point - bbox_converted = BoundingBox(int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3])) + # from helpers.py to call the crop_image_mask_and_point + bbox_converted = BoundingBox( + int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3]) + ) # Crop the image and the mask cropped_image, cropped_mask, _ = crop_image_mask_and_point( @@ -1038,10 +1057,10 @@ def generate_mask_msg( mask_msg.confidence = float(score) return mask_msg - + def visualize_groundingdino_results(self, image: Image, predictions: dict): """ - Visualizes the bounding box predictions of GroundingDINO and then + Visualizes the bounding box predictions of GroundingDINO and then publishes the image as a ROS message. 
Parameters @@ -1054,19 +1073,29 @@ def visualize_groundingdino_results(self, image: Image, predictions: dict): # Define height of image height, _, _ = image_copy.shape - + for phrase, boxes in predictions.items(): for box in boxes: x0, y0, x1, y1 = int(box[0]), int(box[1]), int(box[2]), int(box[3]) - #self.get_logger().info(f"box: {x0}, {y0}, {x1}, {y1}") + # self.get_logger().info(f"box: {x0}, {y0}, {x1}, {y1}") color = (0, 255, 0) thickness = 6 - image_copy = cv2.rectangle(image_copy, (x0, y0), (x1, y1), color, thickness) + image_copy = cv2.rectangle( + image_copy, (x0, y0), (x1, y1), color, thickness + ) # Display text label below bounding box - image_copy = cv2.putText(image_copy, phrase, (x0, y0 - 12), 0, 1e-3 * height, color, thickness // 3) + image_copy = cv2.putText( + image_copy, + phrase, + (x0, y0 - 12), + 0, + 1e-3 * height, + color, + thickness // 3, + ) - cv2.imshow('image', image_copy) + cv2.imshow("image", image_copy) cv2.waitKey(0) cv2.destroyAllWindows() # Publish the image @@ -1074,7 +1103,9 @@ def visualize_groundingdino_results(self, image: Image, predictions: dict): cv2_image_to_ros_msg(image_copy, compress=False, bridge=self.bridge) ) - def display_masks(self, image: npt.NDArray, masks: list[npt.NDArray], item_labels: list[str]): + def display_masks( + self, image: npt.NDArray, masks: list[npt.NDArray], item_labels: list[str] + ): """ Display the masks on the image. @@ -1099,16 +1130,15 @@ def display_masks(self, image: npt.NDArray, masks: list[npt.NDArray], item_label cv2.imshow(label, mask) cv2.waitKey(0) cv2.destroyAllWindows() - #self.get_logger().info(f"Mask max: {np.max(mask)}") - #image_copy = cv2.addWeighted(image, 1.0, mask, 0.3, 0) + # self.get_logger().info(f"Mask max: {np.max(mask)}") + # image_copy = cv2.addWeighted(image, 1.0, mask, 0.3, 0) - #cv2.imshow("Masks", image) - #cv2.waitKey(0) - #cv2.destroyAllWindows() + # cv2.imshow("Masks", image) + # cv2.waitKey(0) + # cv2.destroyAllWindows() def display_mask(self, image: npt.NDArray, mask: npt.NDArray): - """ - """ + """ """ image_copy = deepcopy(image) mask_int = mask.astype(np.uint8) mask_int = cv2.resize(mask_int, (image.shape[1], image.shape[0])) @@ -1116,23 +1146,23 @@ def display_mask(self, image: npt.NDArray, mask: npt.NDArray): mask_int = np.stack([mask_int] * color_dims, axis=-1) color_scalar = random.randint(80, 255) mask_int = np.multiply(mask_int, color_scalar) - cv2.imshow('mask', mask_int) + cv2.imshow("mask", mask_int) cv2.waitKey(0) cv2.destroyAllWindows() - + async def run_vision_pipeline(self, image_msg: Image, caption: str): """ - Run the vision pipeline consisting of two foundation models, GroundingDINO and - EfficientSAM, on the image. GroundingDINO is prompted with a caption and the latest image, - and outputs bounding boxes for each semantic label in the caption detected in the - image. The detected items are then segmented by passing in the bounding box detections - into EfficientSAM which outputs pixel-wise masks for each bounding box. The top masks - are then returned along with the semantic label for each mask as a dictionary. + Run the vision pipeline consisting of two foundation models, GroundingDINO and + EfficientSAM, on the image. GroundingDINO is prompted with a caption and the latest image, + and outputs bounding boxes for each semantic label in the caption detected in the + image. The detected items are then segmented by passing in the bounding box detections + into EfficientSAM which outputs pixel-wise masks for each bounding box. 
The top masks + are then returned along with the semantic label for each mask as a dictionary. Parameters ---------- image: The image to segment, as a ROS image message. - caption: The caption to use for GroundingDINO containing all the food items + caption: The caption to use for GroundingDINO containing all the food items detected in the image. Returns @@ -1142,7 +1172,7 @@ async def run_vision_pipeline(self, image_msg: Image, caption: str): """ self.get_logger().info("Running the vision pipeline...") - # Set the initial time to measure the elapsed time running GroundingDINO on the + # Set the initial time to measure the elapsed time running GroundingDINO on the # desired image and text prompts. inference_time = time.time() @@ -1161,20 +1191,22 @@ async def run_vision_pipeline(self, image_msg: Image, caption: str): with self.latest_depth_img_msg_lock: depth_img_msg = self.latest_depth_img_msg depth_img = ros_msg_to_cv2_image(depth_img_msg, self.bridge) - - # Convert the image to OpenCV format + + # Convert the image to OpenCV format image = ros_msg_to_cv2_image(image_msg, self.bridge) # Run GroundingDINO on the image - bbox_predictions = self.run_grounding_dino(image, caption, self.box_threshold, self.text_threshold) + bbox_predictions = self.run_grounding_dino( + image, caption, self.box_threshold, self.text_threshold + ) # Publish a visualization of the GroundingDINO predictions, if the visualization - # flag is set to true + # flag is set to true if self.viz_groundingdino: self.visualize_groundingdino_results(image, bbox_predictions) - # Collect the top contender mask for each food item label detected by - # GroundingDINO using EfficientSAM and create dictionary of mask + # Collect the top contender mask for each food item label detected by + # GroundingDINO using EfficientSAM and create dictionary of mask # predictions from the pipeline detected_items = [] item_labels = [] @@ -1184,7 +1216,7 @@ async def run_vision_pipeline(self, image_msg: Image, caption: str): for box in boxes: masks, scores = self.run_efficient_sam(image, None, box, 1) if len(masks) > 0: - #self.get_logger().info(f"Mask: {masks[0]}") + # self.get_logger().info(f"Mask: {masks[0]}") masks_list.append(masks[0]) item_id = f"food_id_{mask_num:d}" mask_num += 1 @@ -1193,18 +1225,20 @@ async def run_vision_pipeline(self, image_msg: Image, caption: str): ) detected_items.append(mask_msg) item_labels.append(phrase) - - #self.display_masks(image, masks_list, item_labels) - #self.get_logger().info(f"Detected items: {detected_items}") + + # self.display_masks(image, masks_list, item_labels) + # self.get_logger().info(f"Detected items: {detected_items}") self.get_logger().info(f"Item_labels: {item_labels}") result.detected_items = detected_items result.item_labels = item_labels # Measure the elapsed time running GroundingDINO on the image prompt inference_time = int(round((time.time() - inference_time) * 1000)) - self.get_logger().info(f"VISION PIPELINE - Approx. Inference Time: {inference_time}") + self.get_logger().info( + f"VISION PIPELINE - Approx. 
Inference Time: {inference_time}" + ) - return result + return result async def execute_callback( self, goal_handle: ServerGoalHandle @@ -1245,16 +1279,16 @@ async def execute_callback( self.run_vision_pipeline, latest_img_msg, caption ) - # Wait for the vision pipeline to finish and keep publishing + # Wait for the vision pipeline to finish and keep publishing # feedback (elapsed time) while waiting feedback = SegmentAllItems.Feedback() while ( - rclpy.ok() + rclpy.ok() and not goal_handle.is_cancel_requested and not vision_pipeline_task.done() ): feedback.elapsed_time = (self.get_clock().now() - starting_time).to_msg() - goal_handle.publish_feedback(feedback) + goal_handle.publish_feedback(feedback) rate.sleep() # If there is a cancel request, cancel the vision pipeline task @@ -1269,8 +1303,8 @@ async def execute_callback( self.active_goal_request = None return result - - # Set the result after the task has been completed + + # Set the result after the task has been completed self.get_logger().info("Goal not cancelled.") self.get_logger().info("VIsion pipeline completed successfully.") result = vision_pipeline_task.result() @@ -1284,7 +1318,6 @@ async def execute_callback( return result - def main(args=None): """ Launch the ROS node and spin. From 23577ae9a3c494b2c991636ba00485135e0e34c6 Mon Sep 17 00:00:00 2001 From: sriramk117 Date: Fri, 8 Nov 2024 18:54:30 -0800 Subject: [PATCH 29/42] changes to segmentallitems node initializing it as a perception node --- ada_feeding_msgs/CMakeLists.txt | 1 - .../ada_feeding_perception_node.py | 2 + .../segment_all_items.py | 231 ++++++++++-------- .../launch/ada_feeding_perception.launch.py | 2 + 4 files changed, 130 insertions(+), 106 deletions(-) diff --git a/ada_feeding_msgs/CMakeLists.txt b/ada_feeding_msgs/CMakeLists.txt index aba822a4..6693e79f 100644 --- a/ada_feeding_msgs/CMakeLists.txt +++ b/ada_feeding_msgs/CMakeLists.txt @@ -35,7 +35,6 @@ rosidl_generate_interfaces(${PROJECT_NAME} "srv/AcquisitionReport.srv" "srv/AcquisitionSelect.srv" "srv/GetRobotState.srv" - "srv/ToggleFaceDetection.srv" "srv/GenerateCaption.srv" "srv/ModifyCollisionObject.srv" diff --git a/ada_feeding_perception/ada_feeding_perception/ada_feeding_perception_node.py b/ada_feeding_perception/ada_feeding_perception/ada_feeding_perception_node.py index f891de5f..b65ae163 100755 --- a/ada_feeding_perception/ada_feeding_perception/ada_feeding_perception_node.py +++ b/ada_feeding_perception/ada_feeding_perception/ada_feeding_perception_node.py @@ -171,6 +171,7 @@ def main(args=None): from ada_feeding_perception.face_detection import FaceDetectionNode from ada_feeding_perception.food_on_fork_detection import FoodOnForkDetectionNode from ada_feeding_perception.segment_from_point import SegmentFromPointNode + from ada_feeding_perception.segment_all_items import SegmentAllItemsNode from ada_feeding_perception.table_detection import TableDetectionNode rclpy.init(args=args) @@ -179,6 +180,7 @@ def main(args=None): face_detection = FaceDetectionNode(node) food_on_fork_detection = FoodOnForkDetectionNode(node) segment_from_point = SegmentFromPointNode(node) # pylint: disable=unused-variable + segment_all_items = SegmentAllItemsNode(node) # pylint: disable=unused-variable table_detection = TableDetectionNode(node) executor = MultiThreadedExecutor(num_threads=16) diff --git a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py index f2508e29..b67d2187 100644 --- 
a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py +++ b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py @@ -26,6 +26,7 @@ from rclpy.executors import MultiThreadedExecutor from rclpy.node import Node from rclpy.parameter import Parameter +from rclpy.qos import QoSProfile, ReliabilityPolicy from segment_anything import sam_model_registry, SamPredictor from groundingdino.models import build_model from groundingdino.util.slconfig import SLConfig @@ -54,7 +55,7 @@ get_img_msg_type, ros_msg_to_cv2_image, ) - +from ada_feeding_perception.ada_feeding_perception_node import ADAFeedingPerceptionNode class SegmentAllItemsNode(Node): """ @@ -63,11 +64,17 @@ class SegmentAllItemsNode(Node): label using GPT-4o, GroundingDINO, and SegmentAnything. """ - def __init__(self): + def __init__(self, node: ADAFeedingPerceptionNode): """ Initialize the SegmentAllItemsNode. - """ + Parameters + ---------- + node: The ADAFeedingPerceptionNode. + The node that contains all functionality to get camera images (RGB and depth) + and camera info. + """ + self._node = node super().__init__("segment_all_items") # Check if cuda is available @@ -96,18 +103,18 @@ def __init__(self): # Download the checkpoint for SAM/EfficientSAM if it doesn't exist seg_model_path = os.path.join(model_dir, seg_model_name) if not os.path.isfile(seg_model_path): - self.get_logger().info("Model checkpoint does not exist. Downloading...") + self._node.get_logger().info("Model checkpoint does not exist. Downloading...") download_checkpoint(seg_model_name, model_dir, seg_model_base_url) - self.get_logger().info(f"Model checkpoint downloaded {seg_model_path}.") + self._node.get_logger().info(f"Model checkpoint downloaded {seg_model_path}.") # Download the checkpoint for GroundingDINO if it doesn't exist groundingdino_model_path = os.path.join(model_dir, groundingdino_model_name) if not os.path.isfile(groundingdino_model_path): - self.get_logger().info("Model checkpoint does not exist. Downloading...") + self._node.get_logger().info("Model checkpoint does not exist. Downloading...") download_checkpoint( groundingdino_model_name, model_dir, groundingdino_model_base_url ) - self.get_logger().info( + self._node.get_logger().info( f"Model checkpoint downloaded {groundingdino_model_path}." ) @@ -115,52 +122,46 @@ def __init__(self): groundingdino_config_path = os.path.join(model_dir, groundingdino_config_name) # Subscribe to the camera info topic, to get the camera intrinsics + self.camera_info_topic = "~/camera_info" self.camera_info = None - self.camera_info_lock = threading.Lock() - self.camera_info_subscriber = self.create_subscription( + self._node.add_subscription( CameraInfo, - "~/camera_info", - self.camera_info_callback, - 1, + self.camera_info_topic, + QoSProfile(depth=1, reliability=ReliabilityPolicy.RELIABLE), callback_group=MutuallyExclusiveCallbackGroup(), ) # Subscribe to the aligned depth image topic, to store the latest depth image # NOTE: We assume this is in the same frame as the RGB image - self.latest_depth_img_msg = None - self.latest_depth_img_msg_lock = threading.Lock() - aligned_depth_topic = "~/aligned_depth" + self.aligned_depth_topic = "~/aligned_depth" try: - aligned_depth_type = get_img_msg_type(aligned_depth_topic, self) + aligned_depth_type = get_img_msg_type(self.aligned_depth_topic, self._node) except ValueError as err: - self.get_logger().error( - f"Error getting type of depth image topic. Defaulting to CompressedImage. 
{err}" + self._node.get_logger().error( + f"Error getting type of depth image topic. Defaulting to Image. {err}" ) - aligned_depth_type = CompressedImage - self.depth_image_subscriber = self.create_subscription( + aligned_depth_type = Image + # Subscribe to the depth image + self._node.add_subscription( aligned_depth_type, - aligned_depth_topic, - self.depth_image_callback, - 1, + self.aligned_depth_topic, + QoSProfile(depth=1, reliability=ReliabilityPolicy.RELIABLE), callback_group=MutuallyExclusiveCallbackGroup(), ) # Subscribe to the RGB image topic, to store the latest image - self.latest_img_msg = None - self.latest_img_msg_lock = threading.Lock() - image_topic = "~/image" + self.rgb_image_topic = "~/image" try: - image_type = get_img_msg_type(image_topic, self) + image_type = get_img_msg_type(self.rgb_image_topic, self._node) except ValueError as err: - self.get_logger().error( + self._node.get_logger().error( f"Error getting type of image topic. Defaulting to CompressedImage. {err}" ) image_type = CompressedImage - self.image_subscriber = self.create_subscription( + self._node.add_subscription( image_type, - image_topic, - self.image_callback, - 1, + self.rgb_image_topic, + QoSProfile(depth=1, reliability=ReliabilityPolicy.RELIABLE), callback_group=MutuallyExclusiveCallbackGroup(), ) @@ -444,7 +445,7 @@ def initialize_grounding_dino( groundingdino_config_path: The path to the GroundingDINO configuration file. groundingdino_model_path: The path to the GroundingDINO model checkpoint. """ - self.get_logger().info("Initializing GroundingDINO...") + self._node.get_logger().info("Initializing GroundingDINO...") # Get model configuration arguments from the configuration file config_args = SLConfig.fromfile(groundingdino_config_path) @@ -456,11 +457,11 @@ def initialize_grounding_dino( load_log = groundingdino.load_state_dict( clean_state_dict(checkpoint["model"]), strict=False ) - self.get_logger().info(f"Loaded model checkpoint: {load_log}") + self._node.get_logger().info(f"Loaded model checkpoint: {load_log}") _ = groundingdino.eval() self.groundingdino = groundingdino.to(device=self.device) - self.get_logger().info("...Done!") + self._node.get_logger().info("...Done!") def initialize_sam(self, model_name: str, model_path: str) -> None: """ @@ -479,7 +480,7 @@ def initialize_sam(self, model_name: str, model_path: str) -> None: ------ ValueError if the model name does not contain vit_h, vit_l, or vit_b """ - self.get_logger().info("Initializing SAM...") + self._node.get_logger().info("Initializing SAM...") # Load the model and move it to the specified device if "vit_b" in model_name: # base model model_type = "vit_b" @@ -497,7 +498,7 @@ def initialize_sam(self, model_name: str, model_path: str) -> None: # a lock. 
self.sam = SamPredictor(sam) - self.get_logger().info("...Done!") + self._node.get_logger().info("...Done!") def initialize_efficient_sam(self, model_name: str, model_path: str) -> None: """ @@ -516,7 +517,7 @@ def initialize_efficient_sam(self, model_name: str, model_path: str) -> None: ------ ValueError if the model name does not contain efficient_sam """ - self.get_logger().info("Initializing EfficientSAM...") + self._node.get_logger().info("Initializing EfficientSAM...") # Hardcoded from https://github.com/yformer/EfficientSAM/blob/main/efficient_sam/build_efficient_sam.py if "vits" in model_name: encoder_patch_embed_dim = 384 @@ -533,7 +534,7 @@ def initialize_efficient_sam(self, model_name: str, model_path: str) -> None: ).eval() self.efficient_sam.to(device=self.device) - self.get_logger().info("...Done!") + self._node.get_logger().info("...Done!") def camera_info_callback(self, msg: CameraInfo) -> None: """ @@ -578,29 +579,30 @@ def goal_callback(self, goal_request: SegmentAllItems.Goal) -> GoalResponse: goal_request: The goal request. """ # If no RGB image is received, reject the goal request - with self.latest_img_msg_lock: - if self.latest_img_msg is None: - self.get_logger().info( - "Rejecting goal request because no color image was received" - ) - return GoalResponse.REJECT + self._node.get_logger().info("Received goal request...") + latest_rgb_img_msg = None + if latest_rgb_img_msg is None: + self._node.get_logger().info( + "Rejecting goal request because no color image was received" + ) + return GoalResponse.REJECT # If no depth image is received, reject the goal request - with self.latest_depth_img_msg_lock: - if self.latest_depth_img_msg is None: - self.get_logger().info( - "Rejecting goal request because no depth image was received" - ) - return GoalResponse.REJECT + latest_depth_img_msg = self._node.get_latest_msg(self.aligned_depth_topic) + if latest_depth_img_msg is None: + self._node.get_logger().info( + "Rejecting goal request because no depth image was received" + ) + return GoalResponse.REJECT # Accept the goal request is there isn't already an active one, # otherwise reject it with self.active_goal_request_lock: if self.active_goal_request is None: - self.get_logger().info("Accepting goal request") + self._node.get_logger().info("Accepting goal request") self.active_goal_request = goal_request return GoalResponse.ACCEPT - self.get_logger().info( + self._node.get_logger().info( "Rejecting goal request because there is already an active one" ) return GoalResponse.REJECT @@ -615,7 +617,7 @@ def cancel_callback(self, _: ServerGoalHandle) -> CancelResponse: ---------- goal_handle: The goal handle. """ - self.get_logger().info("Cancelling the goal request...") + self._node.get_logger().info("Cancelling the goal request...") return CancelResponse.ACCEPT def invoke_gpt4o_callback( @@ -637,14 +639,17 @@ def invoke_gpt4o_callback( response: The updated response message based on the request. 
""" # Get the latest image and camera info - with self.latest_img_msg_lock: - latest_img_msg = self.latest_img_msg - with self.camera_info_lock: + latest_img_msg = self._node.get_latest_msg(self.rgb_image_topic) + if self.camera_info is None: + self.camera_info = self._node.get_latest_msg(self.camera_info_topic) + if self.camera_info is not None: camera_info = self.camera_info + else: + camera_info = None # Check if the image and camera info are available if latest_img_msg is None or camera_info is None: - self.get_logger().error("Image or camera info not available.") + self._node.get_logger().error("Image or camera info not available.") return response # Convert the image message to a CV2 image @@ -652,7 +657,7 @@ def invoke_gpt4o_callback( # Run GPT-4o to generate a caption for the image vlm_query = self.run_gpt4o(image, request.input_labels) - self.get_logger().info(f"GPT-4o Query: {vlm_query}") + self._node.get_logger().info(f"GPT-4o Query: {vlm_query}") # Set the response message to the caption generated by GPT-4o response.caption = vlm_query @@ -674,7 +679,7 @@ def run_gpt4o(self, image: npt.NDArray, labels_list: list[str]): vlm_query: The caption generated by GPT-4o that is used as text input for GroundingDINO. """ - self.get_logger().info("Running GPT-4o...") + self._node.get_logger().info("Running GPT-4o...") # Encode the image to JPEG format _, buffer = cv2.imencode(".jpg", image) @@ -763,7 +768,7 @@ def run_sam( masks: The masks for each segmentation. scores: The confidence scores for each segmentation. """ - self.get_logger().info("Segmenting image with SAM...") + self._node.get_logger().info("Segmenting image with SAM...") # Convert image from BGR to RGB for Segment Anything image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) @@ -807,7 +812,7 @@ def run_efficient_sam( masks: The masks for each segmentation. scores: The confidence scores for each segmentation. """ - self.get_logger().info("Segmenting image with EfficientSAM...") + self._node.get_logger().info("Segmenting image with EfficientSAM...") # Convert image from BGR to RGB for Segment Anything image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) @@ -872,7 +877,7 @@ def run_grounding_dino( bbox_predictions: A dictionary containing the bounding boxes for each food item label detected from the image. """ - self.get_logger().info("Running GroundingDINO...") + self._node.get_logger().info("Running GroundingDINO...") # Set the initial time to measure the elapsed time running GroundingDINO on the # desired image and text prompts. @@ -886,7 +891,7 @@ def run_grounding_dino( # Run GroundingDINO on the image using the input caption image_transformed = image_transformed.to(device=self.device) - # self.get_logger().info(f"device: {self.device}") + # self._node.get_logger().info(f"device: {self.device}") with torch.no_grad(): outputs = self.groundingdino( image_transformed[None], @@ -894,7 +899,7 @@ def run_grounding_dino( ) logits = outputs["pred_logits"].sigmoid()[0] boxes = outputs["pred_boxes"][0] - self.get_logger().info("... Done") + self._node.get_logger().info("... 
Done") # Filter the output based on the box and text thresholds boxes_cxcywh = {} @@ -904,9 +909,9 @@ def run_grounding_dino( logits_filt = logits_filt[filt_thresh_mask] boxes_filt = boxes_filt[filt_thresh_mask] - # self.get_logger().info(f"Caption: {caption}") - # self.get_logger().info(f"Boxes: {boxes_filt}") - # self.get_logger().info(f"Logits: {logits_filt}") + # self._node.get_logger().info(f"Caption: {caption}") + # self._node.get_logger().info(f"Boxes: {boxes_filt}") + # self._node.get_logger().info(f"Logits: {logits_filt}") # Tokenize the caption tokenizer = self.groundingdino.tokenizer @@ -918,15 +923,15 @@ def run_grounding_dino( phrase = get_phrases_from_posmap( logit > text_threshold, caption_tokens, tokenizer ) - # self.get_logger().info(f"logit: {logit}, box: {box}") - # self.get_logger().info(f"{phrase}") + # self._node.get_logger().info(f"logit: {logit}, box: {box}") + # self._node.get_logger().info(f"{phrase}") if phrase not in boxes_cxcywh: boxes_cxcywh[phrase] = [] boxes_cxcywh[phrase].append(box.cpu().numpy()) # Define height and width of image height, width, _ = image.shape - # self.get_logger().info(f"height, width: {height}, {width}") + # self._node.get_logger().info(f"height, width: {height}, {width}") # Convert the bounding boxes outputted by GroundingDINO to the following format # [top left x-value, top left y-value, bottom right x-value, bottom right y-value] @@ -945,11 +950,11 @@ def run_grounding_dino( y1 = y0 + h boxes_xyxy[phrase].append([x0, y0, x1, y1]) - # self.get_logger().info(f"Predictions: {boxes_xyxy}") + # self._node.get_logger().info(f"Predictions: {boxes_xyxy}") # Measure the elapsed time running GroundingDINO on the image prompt inference_time = int(round((time.time() - inference_time) * 1000)) - # self.get_logger().info(f"Approx. Inference Time: {inference_time}") + # self._node.get_logger().info(f"Approx. Inference Time: {inference_time}") return boxes_xyxy @@ -993,7 +998,19 @@ def generate_mask_msg( depth_img: npt.NDArray, bbox: Tuple[int, int, int, int], ) -> Optional[Mask]: - """ """ + """ + Convert a mask detected by EfficientSAM or SAM into a ROS Mask message. + + Parameters + ---------- + item_id: The item ID of the mask. + object_id: The object ID of the mask. + score: The confidence score outputed by the segmentation model for the mask. + mask: The pixel-wise mask detected. + image: The image the mask was detected on. + depth_img: The most recent depth image. + bbox: The bounding box from GroundingDINO. + """ # Calculate center of the bounding box and use it as the seed point for # getting the connected component of the mask center_x = (bbox[0] + bbox[2]) // 2 @@ -1016,7 +1033,7 @@ def generate_mask_msg( ) # If the depth is invalid, skip this mask and return None if np.isnan(median_depth_mm): - self.get_logger().warn( + self._node.get_logger().warn( f"No depth points within [{self.min_depth_mm}, {self.max_depth_mm}] mm range " f"for mask {item_id}. Skipping mask." ) @@ -1077,7 +1094,7 @@ def visualize_groundingdino_results(self, image: Image, predictions: dict): for phrase, boxes in predictions.items(): for box in boxes: x0, y0, x1, y1 = int(box[0]), int(box[1]), int(box[2]), int(box[3]) - # self.get_logger().info(f"box: {x0}, {y0}, {x1}, {y1}") + # self._node.get_logger().info(f"box: {x0}, {y0}, {x1}, {y1}") color = (0, 255, 0) thickness = 6 image_copy = cv2.rectangle( @@ -1114,7 +1131,7 @@ def display_masks( image: The image to display the masks on. masks: The masks to display on the image. 
""" - self.get_logger().info("Displaying masks...") + self._node.get_logger().info("Displaying masks...") # Create a deep copy of the image to visualize image_copy = deepcopy(image) @@ -1130,7 +1147,7 @@ def display_masks( cv2.imshow(label, mask) cv2.waitKey(0) cv2.destroyAllWindows() - # self.get_logger().info(f"Mask max: {np.max(mask)}") + # self._node.get_logger().info(f"Mask max: {np.max(mask)}") # image_copy = cv2.addWeighted(image, 1.0, mask, 0.3, 0) # cv2.imshow("Masks", image) @@ -1170,7 +1187,7 @@ async def run_vision_pipeline(self, image_msg: Image, caption: str): result: The result message containing masks for all food items detected in the image paired with semantic labels. """ - self.get_logger().info("Running the vision pipeline...") + self._node.get_logger().info("Running the vision pipeline...") # Set the initial time to measure the elapsed time running GroundingDINO on the # desired image and text prompts. @@ -1179,17 +1196,17 @@ async def run_vision_pipeline(self, image_msg: Image, caption: str): # Define the result and create result message header result = SegmentAllItems.Result() result.header = image_msg.header - with self.camera_info_lock: - if self.camera_info is not None: - result.camera_info = self.camera_info - else: - self.get_logger().warn( - "Camera info not received, not including in result message" - ) + if self.camera_info is None: + self.camera_info = self._node.get_latest_msg(self.camera_info_topic) + if self.camera_info is not None: + result.camera_info = self.camera_info + else: + self._node.get_logger().warn( + "Camera info not received, not including in result message" + ) # Get the latest depth image and convert the depth image to OpenCV format - with self.latest_depth_img_msg_lock: - depth_img_msg = self.latest_depth_img_msg + depth_img_msg = self._node.get_latest_msg(self.aligned_depth_topic) depth_img = ros_msg_to_cv2_image(depth_img_msg, self.bridge) # Convert the image to OpenCV format @@ -1216,7 +1233,7 @@ async def run_vision_pipeline(self, image_msg: Image, caption: str): for box in boxes: masks, scores = self.run_efficient_sam(image, None, box, 1) if len(masks) > 0: - # self.get_logger().info(f"Mask: {masks[0]}") + # self._node.get_logger().info(f"Mask: {masks[0]}") masks_list.append(masks[0]) item_id = f"food_id_{mask_num:d}" mask_num += 1 @@ -1227,14 +1244,14 @@ async def run_vision_pipeline(self, image_msg: Image, caption: str): item_labels.append(phrase) # self.display_masks(image, masks_list, item_labels) - # self.get_logger().info(f"Detected items: {detected_items}") - self.get_logger().info(f"Item_labels: {item_labels}") + # self._node.get_logger().info(f"Detected items: {detected_items}") + self._node.get_logger().info(f"Item_labels: {item_labels}") result.detected_items = detected_items result.item_labels = item_labels # Measure the elapsed time running GroundingDINO on the image prompt inference_time = int(round((time.time() - inference_time) * 1000)) - self.get_logger().info( + self._node.get_logger().info( f"VISION PIPELINE - Approx. Inference Time: {inference_time}" ) @@ -1255,26 +1272,29 @@ async def execute_callback( result: The result message containing masks for all food items detected in the image paired with semantic labels. 
""" - starting_time = self.get_clock().now() - self.get_logger().info("Received a new goal!") + self._node.get_logger().info("Received a new goal!") + starting_time = self._node.get_clock().now() # Get the latest image and camera info - with self.latest_img_msg_lock: - latest_img_msg = self.latest_img_msg - with self.camera_info_lock: + latest_img_msg = self._node.get_latest_msg(self.rgb_image_topic) + if self.camera_info is None: + self.camera_info = self._node.get_latest_msg(self.camera_info_topic) + if self.camera_info is not None: camera_info = self.camera_info + else: + camera_info = None # Check if the image and camera info are available if latest_img_msg is None or camera_info is None: - self.get_logger().error("Image or camera info not available.") + self._node.get_logger().error("Image or camera info not available.") return SegmentAllItems.Result() # Get the caption from the goal request caption = goal_handle.request.caption - self.get_logger().info(f"caption: {caption}") + self._node.get_logger().info(f"caption: {caption}") # Start running the vision pipeline as a separate thread - rate = self.create_rate(self.rate_hz) + rate = self._node.create_rate(self.rate_hz) vision_pipeline_task = self.executor.create_task( self.run_vision_pipeline, latest_img_msg, caption ) @@ -1287,13 +1307,13 @@ async def execute_callback( and not goal_handle.is_cancel_requested and not vision_pipeline_task.done() ): - feedback.elapsed_time = (self.get_clock().now() - starting_time).to_msg() + feedback.elapsed_time = (self._node.get_clock().now() - starting_time).to_msg() goal_handle.publish_feedback(feedback) rate.sleep() # If there is a cancel request, cancel the vision pipeline task if goal_handle.is_cancel_requested: - self.get_logger().info("Goal cancelled.") + self._node.get_logger().info("Goal cancelled.") goal_handle.canceled() result = SegmentAllItems.Result() result.status = result.STATUS_CANCELLED @@ -1305,8 +1325,8 @@ async def execute_callback( return result # Set the result after the task has been completed - self.get_logger().info("Goal not cancelled.") - self.get_logger().info("VIsion pipeline completed successfully.") + self._node.get_logger().info("Goal not cancelled.") + self._node.get_logger().info("VIsion pipeline completed successfully.") result = vision_pipeline_task.result() goal_handle.succeed() result.status = result.STATUS_SUCCEEDED @@ -1324,7 +1344,8 @@ def main(args=None): """ rclpy.init(args=args) - segment_all_items = SegmentAllItemsNode() + node = ADAFeedingPerceptionNode("segment_all_items") + segment_all_items = SegmentAllItemsNode(node) # Use a MultiThreadedExecutor to enable processing goals concurrently executor = MultiThreadedExecutor(num_threads=5) diff --git a/ada_feeding_perception/launch/ada_feeding_perception.launch.py b/ada_feeding_perception/launch/ada_feeding_perception.launch.py index ed08847f..3244560e 100755 --- a/ada_feeding_perception/launch/ada_feeding_perception.launch.py +++ b/ada_feeding_perception/launch/ada_feeding_perception.launch.py @@ -229,6 +229,8 @@ def generate_launch_description(): parameters=[ segment_from_point_config, segment_from_point_params, + segment_all_items_config, + segment_all_items_params, face_detection_config, face_detection_params, table_detection_config, From 36885419448b2184b88eec7d6596e98fd02facfd Mon Sep 17 00:00:00 2001 From: sriramk117 Date: Sat, 9 Nov 2024 00:12:59 -0800 Subject: [PATCH 30/42] fixed error of topics not being received by segmentallitems action --- .../acquisition/compute_food_frame.py | 2 +- 
.../ada_feeding_perception_node.py | 4 +- .../segment_all_items.py | 82 ++++++++----------- .../config/republisher.yaml | 24 ++++++ .../config/segment_all_items.yaml | 36 ++++++++ .../launch/ada_feeding_perception.launch.py | 35 ++++---- 6 files changed, 113 insertions(+), 70 deletions(-) diff --git a/ada_feeding/ada_feeding/behaviors/acquisition/compute_food_frame.py b/ada_feeding/ada_feeding/behaviors/acquisition/compute_food_frame.py index baf31135..1b7f5ca4 100644 --- a/ada_feeding/ada_feeding/behaviors/acquisition/compute_food_frame.py +++ b/ada_feeding/ada_feeding/behaviors/acquisition/compute_food_frame.py @@ -336,7 +336,7 @@ def update(self) -> py_trees.common.Status: x_unit.vector, x_pos.vector ) - # # If you need to send a fixed food frame to the robot arm, e.g., to + # # If you need to send a fixed food frame to the robot arm, e.g., to # # debug off-centering issues, uncomment this and modify the translation. # deg = 90 # fork roll # world_to_food_transform.transform.translation.x = 0.26262263022586224 diff --git a/ada_feeding_perception/ada_feeding_perception/ada_feeding_perception_node.py b/ada_feeding_perception/ada_feeding_perception/ada_feeding_perception_node.py index b65ae163..89220647 100755 --- a/ada_feeding_perception/ada_feeding_perception/ada_feeding_perception_node.py +++ b/ada_feeding_perception/ada_feeding_perception/ada_feeding_perception_node.py @@ -170,8 +170,8 @@ def main(args=None): # pylint: disable=import-outside-toplevel from ada_feeding_perception.face_detection import FaceDetectionNode from ada_feeding_perception.food_on_fork_detection import FoodOnForkDetectionNode - from ada_feeding_perception.segment_from_point import SegmentFromPointNode from ada_feeding_perception.segment_all_items import SegmentAllItemsNode + from ada_feeding_perception.segment_from_point import SegmentFromPointNode from ada_feeding_perception.table_detection import TableDetectionNode rclpy.init(args=args) @@ -179,8 +179,8 @@ def main(args=None): node = ADAFeedingPerceptionNode("ada_feeding_perception") face_detection = FaceDetectionNode(node) food_on_fork_detection = FoodOnForkDetectionNode(node) - segment_from_point = SegmentFromPointNode(node) # pylint: disable=unused-variable segment_all_items = SegmentAllItemsNode(node) # pylint: disable=unused-variable + segment_from_point = SegmentFromPointNode(node) # pylint: disable=unused-variable table_detection = TableDetectionNode(node) executor = MultiThreadedExecutor(num_threads=16) diff --git a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py index b67d2187..6b58cefe 100644 --- a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py +++ b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py @@ -57,6 +57,7 @@ ) from ada_feeding_perception.ada_feeding_perception_node import ADAFeedingPerceptionNode + class SegmentAllItemsNode(Node): """ The SegmentAllItemsNode launches an action server that segments all food @@ -75,7 +76,6 @@ def __init__(self, node: ADAFeedingPerceptionNode): and camera info. 
""" self._node = node - super().__init__("segment_all_items") # Check if cuda is available self.device = "cuda" if torch.cuda.is_available() else "cpu" @@ -103,14 +103,20 @@ def __init__(self, node: ADAFeedingPerceptionNode): # Download the checkpoint for SAM/EfficientSAM if it doesn't exist seg_model_path = os.path.join(model_dir, seg_model_name) if not os.path.isfile(seg_model_path): - self._node.get_logger().info("Model checkpoint does not exist. Downloading...") + self._node.get_logger().info( + "Model checkpoint does not exist. Downloading..." + ) download_checkpoint(seg_model_name, model_dir, seg_model_base_url) - self._node.get_logger().info(f"Model checkpoint downloaded {seg_model_path}.") + self._node.get_logger().info( + f"Model checkpoint downloaded {seg_model_path}." + ) # Download the checkpoint for GroundingDINO if it doesn't exist groundingdino_model_path = os.path.join(model_dir, groundingdino_model_name) if not os.path.isfile(groundingdino_model_path): - self._node.get_logger().info("Model checkpoint does not exist. Downloading...") + self._node.get_logger().info( + "Model checkpoint does not exist. Downloading..." + ) download_checkpoint( groundingdino_model_name, model_dir, groundingdino_model_base_url ) @@ -185,7 +191,7 @@ def __init__(self, node: ADAFeedingPerceptionNode): self.active_goal_request = None # Create the service that invokes GPT-4o - self.srv = self.create_service( + self.srv = self._node.create_service( GenerateCaption, "~/invoke_gpt4o", self.invoke_gpt4o_callback, @@ -195,7 +201,7 @@ def __init__(self, node: ADAFeedingPerceptionNode): # Create the Action Server. # Note: remapping action names does not work: https://github.com/ros2/ros2/issues/1312 self._action_server = ActionServer( - self, + self._node, SegmentAllItems, "SegmentAllItems", self.execute_callback, @@ -207,7 +213,7 @@ def __init__(self, node: ADAFeedingPerceptionNode): # If the GroundingDINO results visualization flage is set, then a publisher # is created to visualize the bounding box predictions of GroundingDINO if self.viz_groundingdino: - self.viz_groundingdino_pub = self.create_publisher( + self.viz_groundingdino_pub = self._node.create_publisher( Image, "~/groundingdino_detection", 1 ) @@ -240,7 +246,7 @@ def read_params( min_depth_mm, max_depth_mm, viz_groundingdino, - ) = self.declare_parameters( + ) = self._node.declare_parameters( "", [ ( @@ -536,39 +542,6 @@ def initialize_efficient_sam(self, model_name: str, model_path: str) -> None: self._node.get_logger().info("...Done!") - def camera_info_callback(self, msg: CameraInfo) -> None: - """ - Store the latest camera info message. - - Parameters - ---------- - msg: The camera info message. - """ - with self.camera_info_lock: - self.camera_info = msg - - def depth_image_callback(self, msg: Union[Image, CompressedImage]) -> None: - """ - Store the latest depth image message. - - Parameters - ---------- - msg: The depth image message. - """ - with self.latest_depth_img_msg_lock: - self.latest_depth_img_msg = msg - - def image_callback(self, msg: Union[Image, CompressedImage]) -> None: - """ - Store the latest image message. - - Parameters - ---------- - msg: The image message. 
- """ - with self.latest_img_msg_lock: - self.latest_img_msg = msg - def goal_callback(self, goal_request: SegmentAllItems.Goal) -> GoalResponse: """ Accept or reject the goal request based on the availability of the latest @@ -580,7 +553,7 @@ def goal_callback(self, goal_request: SegmentAllItems.Goal) -> GoalResponse: """ # If no RGB image is received, reject the goal request self._node.get_logger().info("Received goal request...") - latest_rgb_img_msg = None + latest_rgb_img_msg = self._node.get_latest_msg(self.rgb_image_topic) if latest_rgb_img_msg is None: self._node.get_logger().info( "Rejecting goal request because no color image was received" @@ -998,7 +971,7 @@ def generate_mask_msg( depth_img: npt.NDArray, bbox: Tuple[int, int, int, int], ) -> Optional[Mask]: - """ + """ Convert a mask detected by EfficientSAM or SAM into a ROS Mask message. Parameters @@ -1112,9 +1085,6 @@ def visualize_groundingdino_results(self, image: Image, predictions: dict): thickness // 3, ) - cv2.imshow("image", image_copy) - cv2.waitKey(0) - cv2.destroyAllWindows() # Publish the image self.viz_groundingdino_pub.publish( cv2_image_to_ros_msg(image_copy, compress=False, bridge=self.bridge) @@ -1293,9 +1263,15 @@ async def execute_callback( caption = goal_handle.request.caption self._node.get_logger().info(f"caption: {caption}") - # Start running the vision pipeline as a separate thread + # Create a rate object to control the rate of the vision pipeline rate = self._node.create_rate(self.rate_hz) - vision_pipeline_task = self.executor.create_task( + + # Define a cleanup function to destroy the rate + def cleanup(): + self._node.destroy_rate(rate) + + # Start running the vision pipeline as a separate thread + vision_pipeline_task = self._node.executor.create_task( self.run_vision_pipeline, latest_img_msg, caption ) @@ -1307,7 +1283,9 @@ async def execute_callback( and not goal_handle.is_cancel_requested and not vision_pipeline_task.done() ): - feedback.elapsed_time = (self._node.get_clock().now() - starting_time).to_msg() + feedback.elapsed_time = ( + self._node.get_clock().now() - starting_time + ).to_msg() goal_handle.publish_feedback(feedback) rate.sleep() @@ -1322,6 +1300,8 @@ async def execute_callback( with self.active_goal_request_lock: self.active_goal_request = None + # Cleanup the rate + cleanup() return result # Set the result after the task has been completed @@ -1335,6 +1315,8 @@ async def execute_callback( with self.active_goal_request_lock: self.active_goal_request = None + # Cleanup the rate + cleanup() return result @@ -1350,7 +1332,7 @@ def main(args=None): # Use a MultiThreadedExecutor to enable processing goals concurrently executor = MultiThreadedExecutor(num_threads=5) - rclpy.spin(segment_all_items, executor=executor) + rclpy.spin(node, executor=executor) if __name__ == "__main__": diff --git a/ada_feeding_perception/config/republisher.yaml b/ada_feeding_perception/config/republisher.yaml index 9a4ff6ac..15a96b5b 100644 --- a/ada_feeding_perception/config/republisher.yaml +++ b/ada_feeding_perception/config/republisher.yaml @@ -3,27 +3,47 @@ republisher: ros__parameters: # The name of the topics to republish from from_topics: + - /camera/color/image_raw/compressed + - /camera/color/camera_info + - /camera/aligned_depth_to_color/image_raw/compressedDepth + - /camera/aligned_depth_to_color/camera_info - /local/camera/aligned_depth_to_color/image_raw/compressedDepth # The types of topics to republish from in_topic_types: - sensor_msgs.msg.CompressedImage + - sensor_msgs.msg.CameraInfo + - 
sensor_msgs.msg.CompressedImage + - sensor_msgs.msg.CameraInfo + - sensor_msgs.msg.CompressedImage # If the republisher should convert types, specify the output type. # Currently, the republisher only supports conversions from # Image->CompressedImage and vice-versa. out_topic_types: + - "" + - "" + - "" + - "" - sensor_msgs.msg.Image # The name of the topics to republish to. NOTE: the `prefix` in the launchfile # must match the below pattern! to_topics: + - /local/camera/color/image_raw/compressed + - /local/camera/color/camera_info + - /local/camera/aligned_depth_to_color/image_raw/compressedDepth + - /local/camera/aligned_depth_to_color/camera_info - /local/camera/aligned_depth_to_color/depth_octomap # The target rates (Hz) for the republished topics. Rates <= 0 will publish # every message. target_rates: - 0 + - 0 + - 0 + - 0 + - 0 # The names of a post-processing functions to apply to the message before # republishing it. Current options are: @@ -38,6 +58,10 @@ republisher: # Any of these post-processing functions can be combined in a comma-separated list. # If an empty string, no post-processors are applied. post_processors: + - "" + - "" + - "" + - "" - threshold,mask,temporal,spatial # Apply filters to the depth image for the Octomap # The binary mask used for "mask" post-processing. This mask will get scaled, # possibly disproportionately, to the same of the image. diff --git a/ada_feeding_perception/config/segment_all_items.yaml b/ada_feeding_perception/config/segment_all_items.yaml index 0a82d745..32a679f9 100644 --- a/ada_feeding_perception/config/segment_all_items.yaml +++ b/ada_feeding_perception/config/segment_all_items.yaml @@ -32,3 +32,39 @@ segment_all_items: # A boolean to determine whether to visualize the bounding box predictions # made by GroundingDINO viz_groundingdino: false + +# NOTE: If using the combined perception node, be very careful to ensure no name clashes of parameters! 
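A quick way to act on this note is to compare the parameter names across every config file that the combined node loads; any key that appears under `ros__parameters` in more than one file will override whatever an earlier file set. A small standalone check along these lines (illustrative only; the file paths and the PyYAML dependency are assumptions, not part of these patches):

import yaml


def find_param_clashes(config_paths, node_name="ada_feeding_perception"):
    """Return parameter names declared in more than one config for the combined node."""
    seen = {}
    clashes = set()
    for path in config_paths:
        with open(path, "r", encoding="utf-8") as config_file:
            data = yaml.safe_load(config_file) or {}
            params = data.get(node_name, {}).get("ros__parameters", {})
        for name in params:
            if name in seen and seen[name] != path:
                clashes.add(name)
            seen.setdefault(name, path)
    return sorted(clashes)


# e.g. find_param_clashes(["segment_all_items.yaml", "segment_from_point.yaml"])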
+ada_feeding_perception: + ros__parameters: + # The name of the Segment Anything model to use + sam_model_name: sam_vit_b_01ec64.pth + # The URL to download the model checkpoint from if it is not already downloaded + sam_model_base_url: "https://dl.fbaipublicfiles.com/segment_anything/" + + # The name of the Efficient Segment Anything model checkpoint to use + efficient_sam_model_name: efficient_sam_vitt.pt + # The URL to download the model checkpoint from if it is not already downloaded + efficient_sam_model_base_url: "https://raw.githubusercontent.com/yformer/EfficientSAM/main/weights/" + + # The path to the configuration file for GroundingDINO + groundingdino_config_name: GroundingDINO_SwinT_OGC.py + # The name of the GroundingDINO model checkpoint to use + groundingdino_model_name: groundingdino_swint_ogc.pth + # The URL to download the model checkpoint from if it is not already downloaded + groundingdino_model_base_url: "https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth" + + # Whether to use SAM or EfficientSAM + use_efficient_sam: true + + # The rate (hz) at which to return feedback + rate_hz: 10.0 + + # The threshold for bounding box detections by GroundingDINO + box_threshold: 0.30 + # The threshold for text detections by GroundingDINO + text_threshold: 0.25 + + # A boolean to determine whether to visualize the bounding box predictions + # made by GroundingDINO + viz_groundingdino: false + \ No newline at end of file diff --git a/ada_feeding_perception/launch/ada_feeding_perception.launch.py b/ada_feeding_perception/launch/ada_feeding_perception.launch.py index 3244560e..10fd768c 100755 --- a/ada_feeding_perception/launch/ada_feeding_perception.launch.py +++ b/ada_feeding_perception/launch/ada_feeding_perception.launch.py @@ -104,6 +104,24 @@ def generate_launch_description(): ), ] + # Load the segment all items node + segment_all_items_config = os.path.join( + ada_feeding_perception_share_dir, "config", "segment_all_items.yaml" + ) + segment_all_items_params = {} + segment_all_items_params["model_dir"] = ParameterValue( + os.path.join(ada_feeding_perception_share_dir, "model"), value_type=str + ) + segment_all_items = Node( + package="ada_feeding_perception", + name="segment_all_items", + executable="segment_all_items", + parameters=[segment_all_items_config, segment_all_items_params], + remappings=realsense_remappings + aligned_depth_remapping, + condition=UnlessCondition(combine_perception_nodes), + ) + launch_description.add_action(segment_all_items) + # Load the segment from point node segment_from_point_config = os.path.join( ada_feeding_perception_share_dir, "config", "segment_from_point.yaml" @@ -122,23 +140,6 @@ def generate_launch_description(): ) launch_description.add_action(segment_from_point) - # Load the segment all items node - segment_all_items_config = os.path.join( - ada_feeding_perception_share_dir, "config", "segment_all_items.yaml" - ) - segment_all_items_params = {} - segment_all_items_params["model_dir"] = ParameterValue( - os.path.join(ada_feeding_perception_share_dir, "model"), value_type=str - ) - segment_all_items = Node( - package="ada_feeding_perception", - name="segment_all_items", - executable="segment_all_items", - parameters=[segment_all_items_config, segment_all_items_params], - remappings=realsense_remappings + aligned_depth_remapping, - ) - launch_description.add_action(segment_all_items) - # Load the face detection node face_detection_config = os.path.join( 
ada_feeding_perception_share_dir, "config", "face_detection.yaml" From 195b123eb167f1412181e61cb5d65d816b5a27dc Mon Sep 17 00:00:00 2001 From: Sriram Kutty Date: Sat, 9 Nov 2024 00:56:58 -0800 Subject: [PATCH 31/42] code cleanup --- .../segment_all_items.py | 82 +++---------------- 1 file changed, 11 insertions(+), 71 deletions(-) diff --git a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py index 6b58cefe..bff2e940 100644 --- a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py +++ b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py @@ -630,7 +630,6 @@ def invoke_gpt4o_callback( # Run GPT-4o to generate a caption for the image vlm_query = self.run_gpt4o(image, request.input_labels) - self._node.get_logger().info(f"GPT-4o Query: {vlm_query}") # Set the response message to the caption generated by GPT-4o response.caption = vlm_query @@ -864,7 +863,6 @@ def run_grounding_dino( # Run GroundingDINO on the image using the input caption image_transformed = image_transformed.to(device=self.device) - # self._node.get_logger().info(f"device: {self.device}") with torch.no_grad(): outputs = self.groundingdino( image_transformed[None], @@ -882,10 +880,6 @@ def run_grounding_dino( logits_filt = logits_filt[filt_thresh_mask] boxes_filt = boxes_filt[filt_thresh_mask] - # self._node.get_logger().info(f"Caption: {caption}") - # self._node.get_logger().info(f"Boxes: {boxes_filt}") - # self._node.get_logger().info(f"Logits: {logits_filt}") - # Tokenize the caption tokenizer = self.groundingdino.tokenizer caption_tokens = tokenizer(caption) @@ -896,15 +890,12 @@ def run_grounding_dino( phrase = get_phrases_from_posmap( logit > text_threshold, caption_tokens, tokenizer ) - # self._node.get_logger().info(f"logit: {logit}, box: {box}") - # self._node.get_logger().info(f"{phrase}") if phrase not in boxes_cxcywh: boxes_cxcywh[phrase] = [] boxes_cxcywh[phrase].append(box.cpu().numpy()) # Define height and width of image height, width, _ = image.shape - # self._node.get_logger().info(f"height, width: {height}, {width}") # Convert the bounding boxes outputted by GroundingDINO to the following format # [top left x-value, top left y-value, bottom right x-value, bottom right y-value] @@ -923,11 +914,8 @@ def run_grounding_dino( y1 = y0 + h boxes_xyxy[phrase].append([x0, y0, x1, y1]) - # self._node.get_logger().info(f"Predictions: {boxes_xyxy}") - # Measure the elapsed time running GroundingDINO on the image prompt inference_time = int(round((time.time() - inference_time) * 1000)) - # self._node.get_logger().info(f"Approx. 
Inference Time: {inference_time}") return boxes_xyxy @@ -949,7 +937,6 @@ def load_image(self, image_array: npt.NDArray): # Convert image to image pillow to apply transformation image_pil = ImagePIL.fromarray(image_array, mode="RGB") - # image_pil.show() transform = T.Compose( [ T.RandomResize([800], max_size=1333), @@ -1064,10 +1051,10 @@ def visualize_groundingdino_results(self, image: Image, predictions: dict): # Define height of image height, _, _ = image_copy.shape + # Draw bounding boxes and text labels for each prediction on the image for phrase, boxes in predictions.items(): for box in boxes: x0, y0, x1, y1 = int(box[0]), int(box[1]), int(box[2]), int(box[3]) - # self._node.get_logger().info(f"box: {x0}, {y0}, {x1}, {y1}") color = (0, 255, 0) thickness = 6 image_copy = cv2.rectangle( @@ -1085,66 +1072,20 @@ def visualize_groundingdino_results(self, image: Image, predictions: dict): thickness // 3, ) - # Publish the image + # Publish the image as a ROS message self.viz_groundingdino_pub.publish( cv2_image_to_ros_msg(image_copy, compress=False, bridge=self.bridge) ) - def display_masks( - self, image: npt.NDArray, masks: list[npt.NDArray], item_labels: list[str] - ): - """ - Display the masks on the image. - - Parameters - ---------- - image: The image to display the masks on. - masks: The masks to display on the image. - """ - self._node.get_logger().info("Displaying masks...") - - # Create a deep copy of the image to visualize - image_copy = deepcopy(image) - - # Display the masks on the image - for mask, label in zip(masks, item_labels): - mask = mask.astype(np.uint8) - mask = cv2.resize(mask, (image.shape[1], image.shape[0])) - color_dims = 3 - mask = np.stack([mask] * color_dims, axis=-1) - color_scalar = random.randint(80, 255) - mask = np.multiply(mask, color_scalar) - cv2.imshow(label, mask) - cv2.waitKey(0) - cv2.destroyAllWindows() - # self._node.get_logger().info(f"Mask max: {np.max(mask)}") - # image_copy = cv2.addWeighted(image, 1.0, mask, 0.3, 0) - - # cv2.imshow("Masks", image) - # cv2.waitKey(0) - # cv2.destroyAllWindows() - - def display_mask(self, image: npt.NDArray, mask: npt.NDArray): - """ """ - image_copy = deepcopy(image) - mask_int = mask.astype(np.uint8) - mask_int = cv2.resize(mask_int, (image.shape[1], image.shape[0])) - color_dims = 3 - mask_int = np.stack([mask_int] * color_dims, axis=-1) - color_scalar = random.randint(80, 255) - mask_int = np.multiply(mask_int, color_scalar) - cv2.imshow("mask", mask_int) - cv2.waitKey(0) - cv2.destroyAllWindows() - async def run_vision_pipeline(self, image_msg: Image, caption: str): """ Run the vision pipeline consisting of two foundation models, GroundingDINO and EfficientSAM, on the image. GroundingDINO is prompted with a caption and the latest image, and outputs bounding boxes for each semantic label in the caption detected in the - image. The detected items are then segmented by passing in the bounding box detections - into EfficientSAM which outputs pixel-wise masks for each bounding box. The top masks - are then returned along with the semantic label for each mask as a dictionary. + image. The detected bounding boxes + semantic label pairs are then segmented by passing + in the bounding box detections into EfficientSAM which outputs pixel-wise masks for each + bounding box. The top masks are then returned along with the semantic label for each mask + as a dictionary. 
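For context on the box format handled above: GroundingDINO reports detections as (center x, center y, width, height) normalized to the [0, 1] range, and the pipeline rescales them into pixel-space (x0, y0, x1, y1) corners before they are used as segmentation prompts. A standalone sketch of that conversion (illustrative, not code from these patches):

def cxcywh_norm_to_xyxy(box, img_width, img_height):
    """Convert a normalized (cx, cy, w, h) box into pixel (x0, y0, x1, y1) corners."""
    cx, cy, w, h = box
    x0 = (cx - w / 2.0) * img_width
    y0 = (cy - h / 2.0) * img_height
    x1 = (cx + w / 2.0) * img_width
    y1 = (cy + h / 2.0) * img_height
    return [x0, y0, x1, y1]


# A detection centered in a 640x480 image that spans half of each dimension:
# prints [160.0, 120.0, 480.0, 360.0]
print(cxcywh_norm_to_xyxy([0.5, 0.5, 0.5, 0.5], 640, 480))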
Parameters ---------- @@ -1203,7 +1144,6 @@ async def run_vision_pipeline(self, image_msg: Image, caption: str): for box in boxes: masks, scores = self.run_efficient_sam(image, None, box, 1) if len(masks) > 0: - # self._node.get_logger().info(f"Mask: {masks[0]}") masks_list.append(masks[0]) item_id = f"food_id_{mask_num:d}" mask_num += 1 @@ -1213,16 +1153,13 @@ async def run_vision_pipeline(self, image_msg: Image, caption: str): detected_items.append(mask_msg) item_labels.append(phrase) - # self.display_masks(image, masks_list, item_labels) - # self._node.get_logger().info(f"Detected items: {detected_items}") - self._node.get_logger().info(f"Item_labels: {item_labels}") result.detected_items = detected_items result.item_labels = item_labels # Measure the elapsed time running GroundingDINO on the image prompt inference_time = int(round((time.time() - inference_time) * 1000)) self._node.get_logger().info( - f"VISION PIPELINE - Approx. Inference Time: {inference_time}" + f"Approximate Vision Pipeline Inference Time: {inference_time}" ) return result @@ -1261,7 +1198,6 @@ async def execute_callback( # Get the caption from the goal request caption = goal_handle.request.caption - self._node.get_logger().info(f"caption: {caption}") # Create a rate object to control the rate of the vision pipeline rate = self._node.create_rate(self.rate_hz) @@ -1334,6 +1270,10 @@ def main(args=None): rclpy.spin(node, executor=executor) + # Destroy the node + node.destroy_node() + rclpy.shutdown() + if __name__ == "__main__": main() From b8a4ccbdb2bdeff0f866d22c10d522437a481a06 Mon Sep 17 00:00:00 2001 From: sriramk117 Date: Mon, 2 Dec 2024 17:10:49 -0800 Subject: [PATCH 32/42] running gpt-4o inference is now an action not a service --- ada_feeding_msgs/CMakeLists.txt | 2 +- .../action/GenerateCaption.action | 28 +++ .../action/SegmentAllItems.action | 4 +- ada_feeding_msgs/srv/GenerateCaption.srv | 11 -- .../segment_all_items.py | 173 ++++++++++++------ 5 files changed, 152 insertions(+), 66 deletions(-) create mode 100644 ada_feeding_msgs/action/GenerateCaption.action delete mode 100644 ada_feeding_msgs/srv/GenerateCaption.srv diff --git a/ada_feeding_msgs/CMakeLists.txt b/ada_feeding_msgs/CMakeLists.txt index 6693e79f..10c09fff 100644 --- a/ada_feeding_msgs/CMakeLists.txt +++ b/ada_feeding_msgs/CMakeLists.txt @@ -24,6 +24,7 @@ rosidl_generate_interfaces(${PROJECT_NAME} "action/AcquireFood.action" "action/ActivateController.action" + "action/GenerateCaption.action" "action/MoveTo.action" "action/MoveToConfiguration.action" "action/MoveToMouth.action" @@ -35,7 +36,6 @@ rosidl_generate_interfaces(${PROJECT_NAME} "srv/AcquisitionReport.srv" "srv/AcquisitionSelect.srv" "srv/GetRobotState.srv" - "srv/GenerateCaption.srv" "srv/ModifyCollisionObject.srv" DEPENDENCIES geometry_msgs sensor_msgs std_msgs diff --git a/ada_feeding_msgs/action/GenerateCaption.action b/ada_feeding_msgs/action/GenerateCaption.action new file mode 100644 index 00000000..379eb8d8 --- /dev/null +++ b/ada_feeding_msgs/action/GenerateCaption.action @@ -0,0 +1,28 @@ +# The interface for an action that takes in a list of input labels +# describing the food items on a plate and returns a sentence caption compiling +# these labels used as a query for GroundingDINO detection. 
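On the caller's side this interface is exercised like any other ROS 2 action: the goal carries the raw label strings and the result carries the composed sentence. A minimal, illustrative goal construction (assumed client code, not part of these patches):

from ada_feeding_msgs.action import GenerateCaption

goal = GenerateCaption.Goal()
goal.input_labels = ["grapes", "strawberries", "baby carrots"]
# Sent through an rclpy ActionClient, the corresponding result's `caption`
# field holds the sentence that is then used to prompt GroundingDINO.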
+ +# A list of semantic labels corresponding to each of the masks of detected +# items in the image +string[] input_labels +--- +# Possible return statuses +uint8 STATUS_SUCCEEDED=0 +uint8 STATUS_FAILED=1 +uint8 STATUS_CANCELED=3 +uint8 STATUS_UNKNOWN=99 + +# Whether the vision pipeline succeeded and if not, why +uint8 status + +# The header for the image that the generated caption by GPT-4o +# corresponds to +std_msgs/Header header +# The camera intrinsics +sensor_msgs/CameraInfo camera_info +# A sentence caption compiling the semantic labels used as a query for +# GroundingDINO to perform bounding box detections. +string caption +--- +# How much time the action has spent running inference on GPT-4o +builtin_interfaces/Duration elapsed_time diff --git a/ada_feeding_msgs/action/SegmentAllItems.action b/ada_feeding_msgs/action/SegmentAllItems.action index 682a07f3..cf4f93a3 100644 --- a/ada_feeding_msgs/action/SegmentAllItems.action +++ b/ada_feeding_msgs/action/SegmentAllItems.action @@ -10,7 +10,7 @@ uint8 STATUS_FAILED=1 uint8 STATUS_CANCELED=3 uint8 STATUS_UNKNOWN=99 -# Whether the segmentation succeeded and if not, why +# Whether the vision pipeline succeeded and if not, why uint8 status # The header for the image that the masks corresponds to @@ -23,5 +23,5 @@ ada_feeding_msgs/Mask[] detected_items # items in the image string[] item_labels --- -# How much time the action has spent segmenting the food item +# How much time the action has spent running the vision pipeline builtin_interfaces/Duration elapsed_time diff --git a/ada_feeding_msgs/srv/GenerateCaption.srv b/ada_feeding_msgs/srv/GenerateCaption.srv deleted file mode 100644 index 8a1d150a..00000000 --- a/ada_feeding_msgs/srv/GenerateCaption.srv +++ /dev/null @@ -1,11 +0,0 @@ -# The interface for a service that takes in a list of input labels -# describing the food items on a plate and returns a sentence caption compiling -# these labels used as a query for GroundingDINO detection. - -# A list of semantic labels corresponding to each of the masks of detected -# items in the image -string[] input_labels ---- -# A sentence caption compiling the semantic labels used as a query for -# GroundingDINO to perform bounding box detections. 
-string caption \ No newline at end of file diff --git a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py index bff2e940..1a476b45 100644 --- a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py +++ b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py @@ -42,9 +42,8 @@ from dotenv import load_dotenv # Local imports -from ada_feeding_msgs.action import SegmentAllItems +from ada_feeding_msgs.action import SegmentAllItems, GenerateCaption from ada_feeding_msgs.msg import Mask -from ada_feeding_msgs.srv import GenerateCaption from ada_feeding_perception.helpers import ( BoundingBox, bbox_from_mask, @@ -190,11 +189,15 @@ def __init__(self, node: ADAFeedingPerceptionNode): self.active_goal_request_lock = threading.Lock() self.active_goal_request = None - # Create the service that invokes GPT-4o - self.srv = self._node.create_service( + # Create the Action Server to invoke GPT-4o + # Note: remapping action names does not work: https://github.com/ros2/ros2/issues/1312 + self._gpt4o_action_server = ActionServer( + self._node, GenerateCaption, - "~/invoke_gpt4o", - self.invoke_gpt4o_callback, + "GenerateCaption", + execute_callback=self.invoke_gpt4o_callback, + goal_callback=self.goal_callback, + cancel_callback=self.cancel_callback, callback_group=MutuallyExclusiveCallbackGroup(), ) @@ -204,7 +207,7 @@ def __init__(self, node: ADAFeedingPerceptionNode): self._node, SegmentAllItems, "SegmentAllItems", - self.execute_callback, + execute_callback=self.execute_callback, goal_callback=self.goal_callback, cancel_callback=self.cancel_callback, callback_group=MutuallyExclusiveCallbackGroup(), @@ -593,50 +596,7 @@ def cancel_callback(self, _: ServerGoalHandle) -> CancelResponse: self._node.get_logger().info("Cancelling the goal request...") return CancelResponse.ACCEPT - def invoke_gpt4o_callback( - self, request: GenerateCaption.Request, response: GenerateCaption.Response - ): - """ - Callback function for the GPT-4o service. This function takes in a list - of string labels describing the foods on an image as input and returns a - caption generated by GPT-4o that compiles these labels into a descriptive - sentence used as a query for GroundingDINO. - - Parameters - ---------- - request: The given request message. - response: The created response message. - - Returns - ---------- - response: The updated response message based on the request. - """ - # Get the latest image and camera info - latest_img_msg = self._node.get_latest_msg(self.rgb_image_topic) - if self.camera_info is None: - self.camera_info = self._node.get_latest_msg(self.camera_info_topic) - if self.camera_info is not None: - camera_info = self.camera_info - else: - camera_info = None - - # Check if the image and camera info are available - if latest_img_msg is None or camera_info is None: - self._node.get_logger().error("Image or camera info not available.") - return response - - # Convert the image message to a CV2 image - image = ros_msg_to_cv2_image(latest_img_msg) - - # Run GPT-4o to generate a caption for the image - vlm_query = self.run_gpt4o(image, request.input_labels) - - # Set the response message to the caption generated by GPT-4o - response.caption = vlm_query - - return response - - def run_gpt4o(self, image: npt.NDArray, labels_list: list[str]): + def run_gpt4o(self, image_msg: Image, labels_list: list[str]): """ Run GPT-4o on the image. 
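Before the vision-language model can be queried, the OpenCV image has to be serialized into a text-safe payload. A self-contained sketch of that encoding step, using a placeholder array in place of the image decoded from the ROS topic (illustrative only):

import base64

import cv2
import numpy as np

# Placeholder BGR image standing in for the decoded ROS image message.
image = np.zeros((480, 640, 3), dtype=np.uint8)

# JPEG-compress the image, then base64-encode the bytes so they can be
# embedded in the request sent to the vision-language model.
success, buffer = cv2.imencode(".jpg", image)
assert success
image_b64 = base64.b64encode(buffer.tobytes()).decode("utf-8")
data_url = f"data:image/jpeg;base64,{image_b64}"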
@@ -653,6 +613,9 @@ def run_gpt4o(self, image: npt.NDArray, labels_list: list[str]): """ self._node.get_logger().info("Running GPT-4o...") + # Convert the image message to a CV2 image + image = ros_msg_to_cv2_image(image_msg) + # Encode the image to JPEG format _, buffer = cv2.imencode(".jpg", image) @@ -715,7 +678,22 @@ def run_gpt4o(self, image: npt.NDArray, labels_list: list[str]): # Get the caption generated by GPT-4o vlm_query = response.choices[0].message.content - return vlm_query + # Define the result and create result message header + result = GenerateCaption.Result() + result.header = image_msg.header + if self.camera_info is None: + self.camera_info = self._node.get_latest_msg(self.camera_info_topic) + if self.camera_info is not None: + result.camera_info = self.camera_info + else: + self._node.get_logger().warn( + "Camera info not received, not including in result message" + ) + + # Set the caption generated by GPT-4o as the result caption + result.caption = vlm_query + + return result def run_sam( self, @@ -1254,6 +1232,97 @@ def cleanup(): # Cleanup the rate cleanup() return result + + async def invoke_gpt4o_callback( + self, goal_handle: ServerGoalHandle + ) -> GenerateCaption.Result: + """ + Callback function for the GPT-4o service. This function takes in a list + of string labels describing the foods on an image as input and returns a + caption generated by GPT-4o that compiles these labels into a descriptive + sentence used as a query for GroundingDINO. + + Parameters + ---------- + request: The given request message. + response: The created response message. + + Returns + ---------- + response: The updated response message based on the request. + """ + self._node.get_logger().info("Received a new goal!") + starting_time = self._node.get_clock().now() + + # Get the latest image and camera info + latest_img_msg = self._node.get_latest_msg(self.rgb_image_topic) + if self.camera_info is None: + self.camera_info = self._node.get_latest_msg(self.camera_info_topic) + if self.camera_info is not None: + camera_info = self.camera_info + else: + camera_info = None + + # Check if the image and camera info are available + if latest_img_msg is None or camera_info is None: + self._node.get_logger().error("Image or camera info not available.") + return GenerateCaption.Result() + + # Convert the image message to a CV2 image + image = ros_msg_to_cv2_image(latest_img_msg) + + # Get the input labels from the request + input_labels = goal_handle.request.input_labels + + # Create a rate object to control the rate at which to return feedback + rate = self._node.create_rate(self.rate_hz) + + # Define a cleanup function to destroy the rate + def cleanup(): + self._node.destroy_rate(rate) + + # Start running the GPT-4o inference as a separate thread + gpt4o_task = self._node.executor.create_task(self.run_gpt4o, latest_img_msg, input_labels) + + # Keep publishing feedback (elapsed time) while waiting for + # the GPT-4o inference to finish + feedback = GenerateCaption.Feedback() + while ( + rclpy.ok() + and not goal_handle.is_cancel_requested + and not gpt4o_task.done() + ): + feedback.elapsed_time = ( + self._node.get_clock().now() - starting_time + ).to_msg() + goal_handle.publish_feedback(feedback) + rate.sleep() + + # If there is a cancel request, cancel the GPT-4o task + if goal_handle.is_cancel_requested: + self._node.get_logger().info("Goal cancelled.") + goal_handle.canceled() + response = GenerateCaption.Result() + response.status = response.STATUS_CANCELLED + + # Cleanup the rate + 
cleanup() + return response + + # Set the result after inference is complete + self._node.get_logger().info("Goal not cancelled.") + self._node.get_logger().info("GPT-4o inference completed successfully.") + response = gpt4o_task.result() + goal_handle.succeed() + response.status = response.STATUS_SUCCEEDED + + # Clear the active goal + with self.active_goal_request_lock: + self.active_goal_request = None + + # Cleanup the rate + cleanup() + return response def main(args=None): From 536373254e469e01804f6ea9ad1f5d4dbc6b2c75 Mon Sep 17 00:00:00 2001 From: sriramk117 Date: Mon, 2 Dec 2024 17:20:05 -0800 Subject: [PATCH 33/42] cleaned up some comments --- .../segment_all_items.py | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py index 1a476b45..5955fd5f 100644 --- a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py +++ b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py @@ -602,8 +602,8 @@ def run_gpt4o(self, image_msg: Image, labels_list: list[str]): Parameters ---------- - image: A CV2 above the plate image that is used as reference for GPT-4o - to generate a caption input for GroundingDINO. + image_msg: An above the plate image (as a ROS image message) that is used as + reference for GPT-4o to generate a caption input for GroundingDINO. labels_list: The list of food items to compile into a sentence prompt. Returns @@ -689,7 +689,7 @@ def run_gpt4o(self, image_msg: Image, labels_list: list[str]): self._node.get_logger().warn( "Camera info not received, not including in result message" ) - + # Set the caption generated by GPT-4o as the result caption result.caption = vlm_query @@ -1060,14 +1060,14 @@ async def run_vision_pipeline(self, image_msg: Image, caption: str): Run the vision pipeline consisting of two foundation models, GroundingDINO and EfficientSAM, on the image. GroundingDINO is prompted with a caption and the latest image, and outputs bounding boxes for each semantic label in the caption detected in the - image. The detected bounding boxes + semantic label pairs are then segmented by passing + image. The detected bounding boxes + semantic label pairs are then segmented by passing in the bounding box detections into EfficientSAM which outputs pixel-wise masks for each - bounding box. The top masks are then returned along with the semantic label for each mask + bounding box. The top masks are then returned along with the semantic label for each mask as a dictionary. Parameters ---------- - image: The image to segment, as a ROS image message. + image_msg: The image to segment, as a ROS image message. caption: The caption to use for GroundingDINO containing all the food items detected in the image. 
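Taken together, the two action servers are intended to be chained: a caller first asks GenerateCaption to compose a sentence from its label list, then passes that sentence to SegmentAllItems as the GroundingDINO caption. A condensed, illustrative client showing that flow with blocking calls (assumed client code, not part of these patches; error and goal-rejection handling omitted):

import rclpy
from rclpy.action import ActionClient
from rclpy.node import Node

from ada_feeding_msgs.action import GenerateCaption, SegmentAllItems


def call_action(node, client, goal):
    """Send a goal and block until its result arrives (sketch only)."""
    client.wait_for_server()
    goal_future = client.send_goal_async(goal)
    rclpy.spin_until_future_complete(node, goal_future)
    result_future = goal_future.result().get_result_async()
    rclpy.spin_until_future_complete(node, result_future)
    return result_future.result().result


def main():
    rclpy.init()
    node = Node("perception_pipeline_client")
    caption_client = ActionClient(node, GenerateCaption, "GenerateCaption")
    segment_client = ActionClient(node, SegmentAllItems, "SegmentAllItems")

    # Step 1: compose a GroundingDINO caption from the known food labels.
    caption_goal = GenerateCaption.Goal()
    caption_goal.input_labels = ["grapes", "carrots", "chicken"]
    caption_result = call_action(node, caption_client, caption_goal)

    # Step 2: detect and label the plate items using that caption.
    segment_goal = SegmentAllItems.Goal()
    segment_goal.caption = caption_result.caption
    segment_result = call_action(node, segment_client, segment_goal)

    for label in segment_result.item_labels:
        node.get_logger().info(f"Detected item: {label}")

    node.destroy_node()
    rclpy.shutdown()


if __name__ == "__main__":
    main()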
@@ -1232,7 +1232,7 @@ def cleanup(): # Cleanup the rate cleanup() return result - + async def invoke_gpt4o_callback( self, goal_handle: ServerGoalHandle ) -> GenerateCaption.Result: @@ -1282,15 +1282,15 @@ def cleanup(): self._node.destroy_rate(rate) # Start running the GPT-4o inference as a separate thread - gpt4o_task = self._node.executor.create_task(self.run_gpt4o, latest_img_msg, input_labels) + gpt4o_task = self._node.executor.create_task( + self.run_gpt4o, latest_img_msg, input_labels + ) - # Keep publishing feedback (elapsed time) while waiting for + # Keep publishing feedback (elapsed time) while waiting for # the GPT-4o inference to finish feedback = GenerateCaption.Feedback() while ( - rclpy.ok() - and not goal_handle.is_cancel_requested - and not gpt4o_task.done() + rclpy.ok() and not goal_handle.is_cancel_requested and not gpt4o_task.done() ): feedback.elapsed_time = ( self._node.get_clock().now() - starting_time @@ -1308,7 +1308,7 @@ def cleanup(): # Cleanup the rate cleanup() return response - + # Set the result after inference is complete self._node.get_logger().info("Goal not cancelled.") self._node.get_logger().info("GPT-4o inference completed successfully.") From d73c983eac4e0a5304e8ff82cd3e5b5efc7523e9 Mon Sep 17 00:00:00 2001 From: sriramk117 Date: Wed, 4 Dec 2024 16:18:27 -0800 Subject: [PATCH 34/42] goal status cancellation --- .../ada_feeding_perception/segment_all_items.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py index 5955fd5f..5a6e1f3e 100644 --- a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py +++ b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py @@ -1303,7 +1303,7 @@ def cleanup(): self._node.get_logger().info("Goal cancelled.") goal_handle.canceled() response = GenerateCaption.Result() - response.status = response.STATUS_CANCELLED + response.status = response.STATUS_CANCELED # Cleanup the rate cleanup() From 23267420a88fce9b688fd70d6000674c335578f9 Mon Sep 17 00:00:00 2001 From: sriramk117 Date: Wed, 11 Dec 2024 14:58:22 -0800 Subject: [PATCH 35/42] temporary changes for running testing procedures --- .../ada_feeding_perception/segment_all_items.py | 3 +++ ada_feeding_perception/config/segment_all_items.yaml | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py index 5a6e1f3e..7a490117 100644 --- a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py +++ b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py @@ -1050,6 +1050,9 @@ def visualize_groundingdino_results(self, image: Image, predictions: dict): thickness // 3, ) + cv2.imshow("GroundingDINO Predictions", image_copy) + cv2.waitKey(0) + # Publish the image as a ROS message self.viz_groundingdino_pub.publish( cv2_image_to_ros_msg(image_copy, compress=False, bridge=self.bridge) diff --git a/ada_feeding_perception/config/segment_all_items.yaml b/ada_feeding_perception/config/segment_all_items.yaml index 32a679f9..a826b6da 100644 --- a/ada_feeding_perception/config/segment_all_items.yaml +++ b/ada_feeding_perception/config/segment_all_items.yaml @@ -31,7 +31,7 @@ segment_all_items: # A boolean to determine whether to visualize the bounding box predictions # made by GroundingDINO - viz_groundingdino: false + viz_groundingdino: true # 
NOTE: If using the combined perception node, be very careful to ensure no name clashes of parameters! ada_feeding_perception: From b95ac8e256e5dbf9a5d12d6c14db0a8ae3ae6ed4 Mon Sep 17 00:00:00 2001 From: sriramk117 Date: Thu, 2 Jan 2025 11:53:33 -0800 Subject: [PATCH 36/42] republisher.yaml reverted to original --- .../segment_all_items.py | 16 +++++++++---- .../config/republisher.yaml | 24 ------------------- .../config/segment_all_items.yaml | 2 +- 3 files changed, 12 insertions(+), 30 deletions(-) diff --git a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py index 5a6e1f3e..17965c41 100644 --- a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py +++ b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py @@ -13,7 +13,6 @@ # Third-party imports import cv2 import time -import random from cv_bridge import CvBridge from efficient_sam.efficient_sam import build_efficient_sam import numpy as np @@ -46,7 +45,6 @@ from ada_feeding_msgs.msg import Mask from ada_feeding_perception.helpers import ( BoundingBox, - bbox_from_mask, crop_image_mask_and_point, cv2_image_to_ros_msg, download_checkpoint, @@ -221,7 +219,7 @@ def __init__(self, node: ADAFeedingPerceptionNode): ) # Initialize the OpenAI API and load environment variables - API_KEY = os.getenv("OPENAI_API_KEY") + API_KEY = os.getenv("OPENAI_API_KEY") self.openai = OpenAI(api_key=API_KEY) def read_params( @@ -841,6 +839,12 @@ def run_grounding_dino( # Run GroundingDINO on the image using the input caption image_transformed = image_transformed.to(device=self.device) + + # Display image transformed + image_pil.show() + #cv2.imshow("transformed", image_transformed) + #cv2.waitKey(0) + with torch.no_grad(): outputs = self.groundingdino( image_transformed[None], @@ -910,7 +914,6 @@ def load_image(self, image_array: npt.NDArray): image_pil: The image in Image pillow format. image: The image in tensor format. 
""" - # Convert image from BGR to RGB to convert CV2 image to image pillow image_array = cv2.cvtColor(image_array, cv2.COLOR_BGR2RGB) # Convert image to image pillow to apply transformation @@ -1050,9 +1053,12 @@ def visualize_groundingdino_results(self, image: Image, predictions: dict): thickness // 3, ) + #cv2.imshow("GroundingDINO Predictions", image_copy) + #cv2.waitKey(0) + # Publish the image as a ROS message self.viz_groundingdino_pub.publish( - cv2_image_to_ros_msg(image_copy, compress=False, bridge=self.bridge) + cv2_image_to_ros_msg(image_copy, compress=False, bridge=self.bridge, encoding="bgr8") ) async def run_vision_pipeline(self, image_msg: Image, caption: str): diff --git a/ada_feeding_perception/config/republisher.yaml b/ada_feeding_perception/config/republisher.yaml index 15a96b5b..9a4ff6ac 100644 --- a/ada_feeding_perception/config/republisher.yaml +++ b/ada_feeding_perception/config/republisher.yaml @@ -3,47 +3,27 @@ republisher: ros__parameters: # The name of the topics to republish from from_topics: - - /camera/color/image_raw/compressed - - /camera/color/camera_info - - /camera/aligned_depth_to_color/image_raw/compressedDepth - - /camera/aligned_depth_to_color/camera_info - /local/camera/aligned_depth_to_color/image_raw/compressedDepth # The types of topics to republish from in_topic_types: - sensor_msgs.msg.CompressedImage - - sensor_msgs.msg.CameraInfo - - sensor_msgs.msg.CompressedImage - - sensor_msgs.msg.CameraInfo - - sensor_msgs.msg.CompressedImage # If the republisher should convert types, specify the output type. # Currently, the republisher only supports conversions from # Image->CompressedImage and vice-versa. out_topic_types: - - "" - - "" - - "" - - "" - sensor_msgs.msg.Image # The name of the topics to republish to. NOTE: the `prefix` in the launchfile # must match the below pattern! to_topics: - - /local/camera/color/image_raw/compressed - - /local/camera/color/camera_info - - /local/camera/aligned_depth_to_color/image_raw/compressedDepth - - /local/camera/aligned_depth_to_color/camera_info - /local/camera/aligned_depth_to_color/depth_octomap # The target rates (Hz) for the republished topics. Rates <= 0 will publish # every message. target_rates: - 0 - - 0 - - 0 - - 0 - - 0 # The names of a post-processing functions to apply to the message before # republishing it. Current options are: @@ -58,10 +38,6 @@ republisher: # Any of these post-processing functions can be combined in a comma-separated list. # If an empty string, no post-processors are applied. post_processors: - - "" - - "" - - "" - - "" - threshold,mask,temporal,spatial # Apply filters to the depth image for the Octomap # The binary mask used for "mask" post-processing. This mask will get scaled, # possibly disproportionately, to the same of the image. diff --git a/ada_feeding_perception/config/segment_all_items.yaml b/ada_feeding_perception/config/segment_all_items.yaml index 32a679f9..a826b6da 100644 --- a/ada_feeding_perception/config/segment_all_items.yaml +++ b/ada_feeding_perception/config/segment_all_items.yaml @@ -31,7 +31,7 @@ segment_all_items: # A boolean to determine whether to visualize the bounding box predictions # made by GroundingDINO - viz_groundingdino: false + viz_groundingdino: true # NOTE: If using the combined perception node, be very careful to ensure no name clashes of parameters! 
ada_feeding_perception: From 9382b67d04216a243a5fc1127a1eea500ab62e66 Mon Sep 17 00:00:00 2001 From: sriramk117 Date: Thu, 2 Jan 2025 16:05:58 -0800 Subject: [PATCH 37/42] segmentation inference optimization workin --- ada_feeding_msgs/CMakeLists.txt | 1 + .../action/SegmentAllItems.action | 6 +- ada_feeding_msgs/action/SegmentFromBox.action | 28 ++ .../segment_all_items.py | 297 ++++++++++++++---- 4 files changed, 275 insertions(+), 57 deletions(-) create mode 100644 ada_feeding_msgs/action/SegmentFromBox.action diff --git a/ada_feeding_msgs/CMakeLists.txt b/ada_feeding_msgs/CMakeLists.txt index 10c09fff..c3d38c78 100644 --- a/ada_feeding_msgs/CMakeLists.txt +++ b/ada_feeding_msgs/CMakeLists.txt @@ -29,6 +29,7 @@ rosidl_generate_interfaces(${PROJECT_NAME} "action/MoveToConfiguration.action" "action/MoveToMouth.action" "action/SegmentAllItems.action" + "action/SegmentFromBox.action" "action/SegmentFromPoint.action" "action/Teleoperate.action" "action/Trigger.action" diff --git a/ada_feeding_msgs/action/SegmentAllItems.action b/ada_feeding_msgs/action/SegmentAllItems.action index cf4f93a3..0933a71d 100644 --- a/ada_feeding_msgs/action/SegmentAllItems.action +++ b/ada_feeding_msgs/action/SegmentAllItems.action @@ -1,5 +1,5 @@ # The interface for an action that gets an image from the camera and returns -# the masks of all segmented items within that image. +# the bounding boxes of all items within that image. # The list of input semantic labels for the food items on the plate string caption @@ -17,8 +17,8 @@ uint8 status std_msgs/Header header # The camera intrinsics sensor_msgs/CameraInfo camera_info -# Masks of all the detected items in the image -ada_feeding_msgs/Mask[] detected_items +# Bounding boxes of all the detected items in the image +sensor_msgs/RegionOfInterest[] detected_items # A list of semantic labels corresponding to each of the masks of detected # items in the image string[] item_labels diff --git a/ada_feeding_msgs/action/SegmentFromBox.action b/ada_feeding_msgs/action/SegmentFromBox.action new file mode 100644 index 00000000..866fc89c --- /dev/null +++ b/ada_feeding_msgs/action/SegmentFromBox.action @@ -0,0 +1,28 @@ +# The interface for an action that gets an image from the camera and a bounding +# box of the desired item to segment, and then returns the pixel-wise mask +# of that item + +# The region of interest (bounding box) to seed the segmentation algorithm with +sensor_msgs/RegionOfInterest region_of_interest + +# The semantic label describing the item bounded by the region of interest +string label +--- +# Possible return statuses +uint8 STATUS_SUCCEEDED=0 +uint8 STATUS_FAILED=1 +uint8 STATUS_CANCELED=3 +uint8 STATUS_UNKNOWN=99 + +# Whether the segmentation succeeded and if not, why +uint8 status + +# The header for the image that the masks corresponds to +std_msgs/Header header +# The camera intrinsics +sensor_msgs/CameraInfo camera_info +# Top contender mask segmented given a bounding box of an item +ada_feeding_msgs/Mask detected_item +--- +# How much time the action has spent segmenting the food item +builtin_interfaces/Duration elapsed_time \ No newline at end of file diff --git a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py index 73d301c5..1399df41 100644 --- a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py +++ b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py @@ -13,6 +13,7 @@ # Third-party imports import cv2 import time +import 
random from cv_bridge import CvBridge from efficient_sam.efficient_sam import build_efficient_sam import numpy as np @@ -41,7 +42,7 @@ from dotenv import load_dotenv # Local imports -from ada_feeding_msgs.action import SegmentAllItems, GenerateCaption +from ada_feeding_msgs.action import SegmentAllItems, GenerateCaption, SegmentFromBox from ada_feeding_msgs.msg import Mask from ada_feeding_perception.helpers import ( BoundingBox, @@ -211,6 +212,18 @@ def __init__(self, node: ADAFeedingPerceptionNode): callback_group=MutuallyExclusiveCallbackGroup(), ) + # Create the Action Server to perform segmentation from a bounding box + # Note: remapping action names does not work: https://github.com/ros2/ros2/issues/1312 + self._segment_action_server = ActionServer( + self._node, + SegmentFromBox, + "SegmentFromBox", + execute_callback=self.segmentation_callback, + goal_callback=self.goal_callback, + cancel_callback=self.cancel_callback, + callback_group=MutuallyExclusiveCallbackGroup(), + ) + # If the GroundingDINO results visualization flage is set, then a publisher # is created to visualize the bounding box predictions of GroundingDINO if self.viz_groundingdino: @@ -219,7 +232,7 @@ def __init__(self, node: ADAFeedingPerceptionNode): ) # Initialize the OpenAI API and load environment variables - API_KEY = os.getenv("OPENAI_API_KEY") + API_KEY = os.getenv("OPENAI_API_KEY") self.openai = OpenAI(api_key=API_KEY) def read_params( @@ -1015,6 +1028,31 @@ def generate_mask_msg( mask_msg.confidence = float(score) return mask_msg + + def display_mask(self, image: npt.NDArray, mask: npt.NDArray, item_label: str): + """ + Display the masks on the image. + + Parameters + ---------- + image: The image to display the masks on. + masks: The masks to display on the image. + """ + self._node.get_logger().info("Displaying masks...") + + # Create a deep copy of the image to visualize + image_copy = deepcopy(image) + + # Display the mask on the image + mask = mask.astype(np.uint8) + mask = cv2.resize(mask, (image.shape[1], image.shape[0])) + color_dims = 3 + mask = np.stack([mask] * color_dims, axis=-1) + color_scalar = random.randint(80, 255) + mask = np.multiply(mask, color_scalar) + cv2.imshow(item_label, mask) + cv2.waitKey(0) + cv2.destroyAllWindows() def visualize_groundingdino_results(self, image: Image, predictions: dict): """ @@ -1126,16 +1164,26 @@ async def run_vision_pipeline(self, image_msg: Image, caption: str): mask_num = 1 for phrase, boxes in bbox_predictions.items(): for box in boxes: - masks, scores = self.run_efficient_sam(image, None, box, 1) - if len(masks) > 0: - masks_list.append(masks[0]) - item_id = f"food_id_{mask_num:d}" - mask_num += 1 - mask_msg = self.generate_mask_msg( - item_id, phrase, scores[0], masks[0], image, depth_img, box - ) - detected_items.append(mask_msg) - item_labels.append(phrase) + # Convert the bounding box from a tuple into a RegionOfInterest message + roi_msg = RegionOfInterest( + x_offset=int(box[0]), + y_offset=int(box[1]), + height=int(box[3] - box[1]), + width=int(box[2] - box[0]), + do_rectify=False, + ) + detected_items.append(roi_msg) + item_labels.append(phrase) + #masks, scores = self.run_efficient_sam(image, None, box, 1) + #if len(masks) > 0: + # masks_list.append(masks[0]) + # item_id = f"food_id_{mask_num:d}" + # mask_num += 1 + # mask_msg = self.generate_mask_msg( + # item_id, phrase, scores[0], masks[0], image, depth_img, box + # ) + # detected_items.append(mask_msg) + # item_labels.append(phrase) result.detected_items = detected_items 
result.item_labels = item_labels @@ -1147,7 +1195,149 @@ async def run_vision_pipeline(self, image_msg: Image, caption: str): ) return result + + async def segment_from_bbox( + self, image_msg: Image, label: str, bbox: Tuple[int, int, int, int] + ) -> Tuple[npt.NDArray, npt.NDArray]: + """ + Segment an image using a bounding box. + + Parameters + ---------- + image: The image to segment. + bbox: The bounding box to segment. + + Returns + ------- + masks: The masks for each segmentation. + scores: The confidence scores for each segmentation. + """ + # Get the latest depth image and convert the depth image to OpenCV format + depth_img_msg = self._node.get_latest_msg(self.aligned_depth_topic) + depth_img = ros_msg_to_cv2_image(depth_img_msg, self.bridge) + + # Convert the image to OpenCV format + image = ros_msg_to_cv2_image(image_msg, self.bridge) + + # Run EfficientSAM on the image using the input bounding box prompt + masks, scores = self.run_efficient_sam(image, None, bbox, 1) + mask_num = 1 + if len(masks) > 0: + item_id = f"food_id_{mask_num:d}" + mask_num += 1 + mask_msg = self.generate_mask_msg( + item_id, label, scores[0], masks[0], image, depth_img, bbox + ) + self.display_mask(image, masks[0], label) + + # Define the result and create result message header + result = SegmentFromBox.Result() + result.header = image_msg.header + if self.camera_info is None: + self.camera_info = self._node.get_latest_msg(self.camera_info_topic) + if self.camera_info is not None: + result.camera_info = self.camera_info + else: + self._node.get_logger().warn( + "Camera info not received, not including in result message" + ) + + # Set the mask message as the result + result.detected_item = mask_msg + + return result + async def invoke_gpt4o_callback( + self, goal_handle: ServerGoalHandle + ) -> GenerateCaption.Result: + """ + Callback function for the GPT-4o service. This function takes in a list + of string labels describing the foods on an image as input and returns a + caption generated by GPT-4o that compiles these labels into a descriptive + sentence used as a query for GroundingDINO. + + Parameters + ---------- + request: The given request message. + response: The created response message. + + Returns + ---------- + response: The updated response message based on the request. 
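Because the detections now cross the action boundary as sensor_msgs RegionOfInterest messages, the (x0, y0, x1, y1) form used internally has to be packed and unpacked consistently on both ends, mirroring the conversions in run_vision_pipeline and segmentation_callback. A small round-trip sketch of that packing (illustrative only):

from sensor_msgs.msg import RegionOfInterest


def xyxy_to_roi(box):
    """Pack an (x0, y0, x1, y1) pixel box into a RegionOfInterest message."""
    x0, y0, x1, y1 = (int(value) for value in box)
    return RegionOfInterest(
        x_offset=x0, y_offset=y0, width=x1 - x0, height=y1 - y0, do_rectify=False
    )


def roi_to_xyxy(roi):
    """Unpack a RegionOfInterest message back into an (x0, y0, x1, y1) list."""
    return [
        roi.x_offset,
        roi.y_offset,
        roi.x_offset + roi.width,
        roi.y_offset + roi.height,
    ]


assert roi_to_xyxy(xyxy_to_roi([160, 120, 480, 360])) == [160, 120, 480, 360]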
+ """ + self._node.get_logger().info("Received a new goal!") + starting_time = self._node.get_clock().now() + + # Get the latest image and camera info + latest_img_msg = self._node.get_latest_msg(self.rgb_image_topic) + if self.camera_info is None: + self.camera_info = self._node.get_latest_msg(self.camera_info_topic) + if self.camera_info is not None: + camera_info = self.camera_info + else: + camera_info = None + + # Check if the image and camera info are available + if latest_img_msg is None or camera_info is None: + self._node.get_logger().error("Image or camera info not available.") + return GenerateCaption.Result() + + # Convert the image message to a CV2 image + image = ros_msg_to_cv2_image(latest_img_msg) + + # Get the input labels from the request + input_labels = goal_handle.request.input_labels + + # Create a rate object to control the rate at which to return feedback + rate = self._node.create_rate(self.rate_hz) + + # Define a cleanup function to destroy the rate + def cleanup(): + self._node.destroy_rate(rate) + + # Start running the GPT-4o inference as a separate thread + gpt4o_task = self._node.executor.create_task( + self.run_gpt4o, latest_img_msg, input_labels + ) + + # Keep publishing feedback (elapsed time) while waiting for + # the GPT-4o inference to finish + feedback = GenerateCaption.Feedback() + while ( + rclpy.ok() and not goal_handle.is_cancel_requested and not gpt4o_task.done() + ): + feedback.elapsed_time = ( + self._node.get_clock().now() - starting_time + ).to_msg() + goal_handle.publish_feedback(feedback) + rate.sleep() + + # If there is a cancel request, cancel the GPT-4o task + if goal_handle.is_cancel_requested: + self._node.get_logger().info("Goal cancelled.") + goal_handle.canceled() + response = GenerateCaption.Result() + response.status = response.STATUS_CANCELED + + # Cleanup the rate + cleanup() + return response + + # Set the result after inference is complete + self._node.get_logger().info("Goal not cancelled.") + self._node.get_logger().info("GPT-4o inference completed successfully.") + response = gpt4o_task.result() + goal_handle.succeed() + response.status = response.STATUS_SUCCEEDED + + # Clear the active goal + with self.active_goal_request_lock: + self.active_goal_request = None + + # Cleanup the rate + cleanup() + return response + async def execute_callback( self, goal_handle: ServerGoalHandle ) -> SegmentAllItems.Result: @@ -1226,7 +1416,7 @@ def cleanup(): # Set the result after the task has been completed self._node.get_logger().info("Goal not cancelled.") - self._node.get_logger().info("VIsion pipeline completed successfully.") + self._node.get_logger().info("Vision pipeline completed successfully.") result = vision_pipeline_task.result() goal_handle.succeed() result.status = result.STATUS_SUCCEEDED @@ -1238,24 +1428,11 @@ def cleanup(): # Cleanup the rate cleanup() return result - - async def invoke_gpt4o_callback( + + async def segmentation_callback( self, goal_handle: ServerGoalHandle - ) -> GenerateCaption.Result: + ) -> SegmentFromBox.Result: """ - Callback function for the GPT-4o service. This function takes in a list - of string labels describing the foods on an image as input and returns a - caption generated by GPT-4o that compiles these labels into a descriptive - sentence used as a query for GroundingDINO. - - Parameters - ---------- - request: The given request message. - response: The created response message. - - Returns - ---------- - response: The updated response message based on the request. 
""" self._node.get_logger().info("Received a new goal!") starting_time = self._node.get_clock().now() @@ -1272,31 +1449,39 @@ async def invoke_gpt4o_callback( # Check if the image and camera info are available if latest_img_msg is None or camera_info is None: self._node.get_logger().error("Image or camera info not available.") - return GenerateCaption.Result() - - # Convert the image message to a CV2 image - image = ros_msg_to_cv2_image(latest_img_msg) - - # Get the input labels from the request - input_labels = goal_handle.request.input_labels - - # Create a rate object to control the rate at which to return feedback + return SegmentFromBox.Result() + + # Get the desired bounding box to segment and its semantic label from the goal request + roi_msg = goal_handle.request.region_of_interest + label = goal_handle.request.label + + # Store the top left and bottom right coordinates of the bounding box given + # the RegionOfInterest message as a list for input to the segmentation model + bbox_xyxy = [ + roi_msg.x_offset, + roi_msg.y_offset, + roi_msg.x_offset + roi_msg.width, + roi_msg.y_offset + roi_msg.height + ] + + # Create a rate object to control the rate of the segmentation task rate = self._node.create_rate(self.rate_hz) # Define a cleanup function to destroy the rate def cleanup(): self._node.destroy_rate(rate) - # Start running the GPT-4o inference as a separate thread - gpt4o_task = self._node.executor.create_task( - self.run_gpt4o, latest_img_msg, input_labels + # Start running the segmentation task as a separate thread + segment_from_bbox_task = self._node.executor.create_task( + self.segment_from_bbox, latest_img_msg, label, bbox_xyxy ) - # Keep publishing feedback (elapsed time) while waiting for - # the GPT-4o inference to finish - feedback = GenerateCaption.Feedback() + # Publish feedback (elapsed time) until segmentation is complete + feedback = SegmentFromBox.Feedback() while ( - rclpy.ok() and not goal_handle.is_cancel_requested and not gpt4o_task.done() + rclpy.ok() + and not goal_handle.is_cancel_requested + and not segment_from_bbox_task.done() ): feedback.elapsed_time = ( self._node.get_clock().now() - starting_time @@ -1304,23 +1489,27 @@ def cleanup(): goal_handle.publish_feedback(feedback) rate.sleep() - # If there is a cancel request, cancel the GPT-4o task + # If there is a cancel request, cancel the vision pipeline task if goal_handle.is_cancel_requested: self._node.get_logger().info("Goal cancelled.") goal_handle.canceled() - response = GenerateCaption.Result() - response.status = response.STATUS_CANCELED + result = SegmentFromBox.Result() + result.status = result.STATUS_CANCELLED + + # Clear the active goal + with self.active_goal_request_lock: + self.active_goal_request = None # Cleanup the rate cleanup() - return response + return result - # Set the result after inference is complete + # Set the result after the task has been completed self._node.get_logger().info("Goal not cancelled.") - self._node.get_logger().info("GPT-4o inference completed successfully.") - response = gpt4o_task.result() + self._node.get_logger().info("Segmentation task completed successfully.") + result = segment_from_bbox_task.result() goal_handle.succeed() - response.status = response.STATUS_SUCCEEDED + result.status = result.STATUS_SUCCEEDED # Clear the active goal with self.active_goal_request_lock: @@ -1328,7 +1517,7 @@ def cleanup(): # Cleanup the rate cleanup() - return response + return result def main(args=None): From 7f701678be3cf269e4dfcdccb3d915c3a5d7d10e Mon Sep 17 00:00:00 
2001 From: sriramk117 Date: Wed, 28 May 2025 14:31:50 -0700 Subject: [PATCH 38/42] updated prompts --- .../segment_all_items.py | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py index 1399df41..f3f0fcb4 100644 --- a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py +++ b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py @@ -635,33 +635,33 @@ def run_gpt4o(self, image_msg: Image, labels_list: list[str]): # Write the system and user prompts for GPT-4o system_query = f""" - You are a prompt engineer that is assigned to describe items - in an image you have been queried so that a vision language model - can take in your prompt as a query and use it to for classification - tasks. You respond in string format and do not provide any explanation - for your responses. + You are a prompt engineer that is assigned to describe items + in an image you have been queried so that a vision language model + can take in your prompt as a query and use it to for classification + tasks. You respond in string format and do not provide any explanation + for your responses. """ user_query = f""" Your objective is to generate a sentence prompt that describes the food - items on a plate in an image. - You are given an image of a plate with food items and a list of the food items + items on a plate in an image. + You are given an image of a plate with food items and a list of the food items on the plate. Please compile the inputs from the list into a sentence prompt that effectively lists the food items on the plate. Add qualifiers to the prompt to better visually describe the food for the VLM to identify. Don't add any irrelevant qualifiers. + Ensure that the words in the input list are included in the prompt and do not + get altered in the sentence. For instance the word "apple" should not be changed + to "apples" if "apple" is provided in the input list. Here is the input list of food items to compile into a string: {labels_list} Here are some sample responses that convey how you should format your responses: - - Food items including grapes, strawberries, blueberries, melon chunks, and - carrots on a small, blue plate. - Food items including strips of grilled meat and seasoned cucumber - spears arranged on a light gray plate. + Food items including red strawberry fruit, green kiwi slices, yellow banana slices, + and orange carrot sticks on a plate. - Food items including baked chicken pieces, black olives, bell pepper slices, + Food items including baked chicken pieces, black olive pieces, bell pepper slices, and artichoke on a plate. 
""" From 74b8cda4f36d91b03ee1fee91a051ffd9b6c5041 Mon Sep 17 00:00:00 2001 From: sriramk117 Date: Wed, 4 Jun 2025 15:18:05 -0700 Subject: [PATCH 39/42] temporarily removed segment from point --- .../ada_feeding_perception_node.py | 2 +- .../ada_feeding_perception/helpers.py | 1 + .../segment_all_items.py | 8 +- .../segment_from_point.py | 161 +++++++++--------- start.py | 2 +- 5 files changed, 91 insertions(+), 83 deletions(-) diff --git a/ada_feeding_perception/ada_feeding_perception/ada_feeding_perception_node.py b/ada_feeding_perception/ada_feeding_perception/ada_feeding_perception_node.py index a35aaa53..d8d4e2dd 100755 --- a/ada_feeding_perception/ada_feeding_perception/ada_feeding_perception_node.py +++ b/ada_feeding_perception/ada_feeding_perception/ada_feeding_perception_node.py @@ -188,7 +188,7 @@ def main(args=None): face_detection = FaceDetectionNode(node) food_on_fork_detection = FoodOnForkDetectionNode(node) segment_all_items = SegmentAllItemsNode(node) # pylint: disable=unused-variable - segment_from_point = SegmentFromPointNode(node) # pylint: disable=unused-variable + # segment_from_point = SegmentFromPointNode(node) # pylint: disable=unused-variable table_detection = TableDetectionNode(node) executor = MultiThreadedExecutor(num_threads=16) diff --git a/ada_feeding_perception/ada_feeding_perception/helpers.py b/ada_feeding_perception/ada_feeding_perception/helpers.py index 982510e9..c9a02a8e 100644 --- a/ada_feeding_perception/ada_feeding_perception/helpers.py +++ b/ada_feeding_perception/ada_feeding_perception/helpers.py @@ -358,6 +358,7 @@ def get_img_msg_type( # Resolve the topic name (e.g., handle remappings) final_topic = node.resolve_topic_name(topic) + rclpy.logging.get_logger("ada_feeding_perception_helpers").info("Resolving topic name: " + final_topic) # Get the publishers on the topic topic_endpoints = node.get_publishers_info_by_topic(final_topic) diff --git a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py index f3f0fcb4..bcedfa6d 100644 --- a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py +++ b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py @@ -232,7 +232,7 @@ def __init__(self, node: ADAFeedingPerceptionNode): ) # Initialize the OpenAI API and load environment variables - API_KEY = os.getenv("OPENAI_API_KEY") + API_KEY = "" self.openai = OpenAI(api_key=API_KEY) def read_params( @@ -1245,6 +1245,12 @@ async def segment_from_bbox( # Set the mask message as the result result.detected_item = mask_msg + # Save the mask message to a file + filename = f"segmentation_result_{item_id}.txt" + with open(filename, "wb") as f: + f.write(result.detected_item.mask.data) + self._node.get_logger().info(f"Saved segmentation result to {filename}") + return result async def invoke_gpt4o_callback( diff --git a/ada_feeding_perception/ada_feeding_perception/segment_from_point.py b/ada_feeding_perception/ada_feeding_perception/segment_from_point.py index 591ff4ba..f91a6ed6 100755 --- a/ada_feeding_perception/ada_feeding_perception/segment_from_point.py +++ b/ada_feeding_perception/ada_feeding_perception/segment_from_point.py @@ -193,65 +193,12 @@ def read_params( max_depth_mm: The maximum depth in mm to consider for a mask. 
""" ( - sam_model_name, - sam_model_base_url, - efficient_sam_model_name, - efficient_sam_model_base_url, model_dir, - use_efficient_sam, n_contender_masks, rate_hz, - min_depth_mm, - max_depth_mm, ) = self._node.declare_parameters( "", [ - ( - "sam_model_name", - None, - ParameterDescriptor( - name="sam_model_name", - type=ParameterType.PARAMETER_STRING, - description="The name of the model checkpoint to use for SAM", - read_only=True, - ), - ), - ( - "sam_model_base_url", - None, - ParameterDescriptor( - name="sam_model_base_url", - type=ParameterType.PARAMETER_STRING, - description=( - "The URL to download the model checkpoint from if " - "it is not already downloaded for SAM" - ), - read_only=True, - ), - ), - ( - "efficient_sam_model_name", - None, - ParameterDescriptor( - name="efficient_sam_model_name", - type=ParameterType.PARAMETER_STRING, - description="The name of the model checkpoint to use for EfficientSAM", - read_only=True, - ), - ), - ( - "efficient_sam_model_base_url", - None, - ParameterDescriptor( - name="efficient_sam_model_base_url", - type=ParameterType.PARAMETER_STRING, - description=( - "The URL to download the model checkpoint from if " - "it is not already downloaded for EfficientSAM" - ), - read_only=True, - ), - ), ( "segment_from_point_model_dir", None, @@ -265,16 +212,6 @@ def read_params( read_only=True, ), ), - ( - "use_efficient_sam", - True, - ParameterDescriptor( - name="use_efficient_sam", - type=ParameterType.PARAMETER_BOOL, - description=("Whether to use EfficientSAM or SAM"), - read_only=True, - ), - ), ( "n_contender_masks", 3, @@ -295,27 +232,91 @@ def read_params( read_only=True, ), ), - ( - "min_depth_mm", - 330, - ParameterDescriptor( - name="min_depth_mm", - type=ParameterType.PARAMETER_INTEGER, - description="The minimum depth in mm to consider in a mask.", - read_only=True, + ], + ) + + use_efficient_sam = self._node.get_parameter( + "use_efficient_sam", + Parameter( + "use_efficient_sam", + ParameterType.PARAMETER_BOOL, + False, + ParameterDescriptor( + name="use_efficient_sam", + type=ParameterType.PARAMETER_BOOL, + description="Whether to use EfficientSAM or SAM.", + read_only=True, + ), + ), + ) + + sam_model_name = self._node.get_parameter( + "sam_model_name", + Parameter( + "sam_model_name", + ParameterType.PARAMETER_STRING, + "vit_h_4b8939.pth", + ParameterDescriptor( + name="sam_model_name", + type=ParameterType.PARAMETER_STRING, + description=( + "The name of the Segment Anything model checkpoint to use. " + "This should be the name of a file in the model directory." ), + read_only=True, ), - ( - "max_depth_mm", - 10150000, - ParameterDescriptor( - name="max_depth_mm", - type=ParameterType.PARAMETER_INTEGER, - description="The maximum depth in mm to consider in a mask.", - read_only=True, + ), + ) + sam_model_base_url = self._node.get_parameter( + "sam_model_base_url", + Parameter( + "sam_model_base_url", + ParameterType.PARAMETER_STRING, + "https://dl.fbaipublicfiles.com/segment_anything/", + ParameterDescriptor( + name="sam_model_base_url", + type=ParameterType.PARAMETER_STRING, + description=( + "The URL to download the Segment Anything model checkpoint from " + "if it is not already downloaded." 
), + read_only=True, ), - ], + ), + ) + efficient_sam_model_name = self._node.get_parameter( + "efficient_sam_model_name", + Parameter( + "efficient_sam_model_name", + ParameterType.PARAMETER_STRING, + "vits_l.pth", + ParameterDescriptor( + name="efficient_sam_model_name", + type=ParameterType.PARAMETER_STRING, + description=( + "The name of the EfficientSAM model checkpoint to use. " + "This should be the name of a file in the model directory." + ), + read_only=True, + ), + ), + ) + efficient_sam_model_base_url = self._node.get_parameter( + "efficient_sam_model_base_url", + None, + Parameter( + "efficient_sam_model_base_url", + ParameterType.PARAMETER_STRING, + "https://dl.fbaipublicfiles.com/segment_anything/", + ParameterDescriptor( + name="efficient_sam_model_base_url", + type=ParameterType.PARAMETER_STRING, + description=( + "The URL to download the EfficientSAM model checkpoint from " + "if it is not already downloaded." + ), + ), + ), ) if use_efficient_sam.value: diff --git a/start.py b/start.py index b0eb8662..70d1b315 100755 --- a/start.py +++ b/start.py @@ -288,7 +288,7 @@ async def main(args: argparse.Namespace, pwd: str) -> None: f"ros2 launch ada_planning_scene ada_moveit_launch.xml use_rviz:={'true' if args.dev else 'false'}", ], "feeding": [ - "sudo ./src/ada_feeding/configure_lovelace.sh", + # "sudo ./src/ada_feeding/configure_lovelace.sh", ( "ros2 launch ada_feeding ada_feeding_launch.xml " f"use_estop:={'false' if args.dev else 'true'} run_web_bridge:=false policy:={args.policy}" From e7f7282d76e5ae75ff79defbe84e84667a88b95f Mon Sep 17 00:00:00 2001 From: sriramk117 Date: Wed, 4 Jun 2025 17:30:47 -0700 Subject: [PATCH 40/42] implemented nms to remove dino prediction duplicates --- .../segment_all_items.py | 164 +++++++++++++++--- 1 file changed, 141 insertions(+), 23 deletions(-) diff --git a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py index bcedfa6d..30644579 100644 --- a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py +++ b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py @@ -814,6 +814,100 @@ def run_efficient_sam( scores = predicted_iou[0, 0, :].cpu().detach().numpy() return masks, scores + + def calculate_iou(box1: npt.NDArray, box2: npt.NDArray) -> float: + """ + Calculate the Intersection over Union (IoU) between two bounding boxes. + + Parameters + ---------- + box1: The first bounding box in the format [x1, y1, x2, y2]. + box2: The second bounding box in the format [x1, y1, x2, y2]. + + Returns + ------- + iou: The IoU between the two bounding boxes. 
+ """ + # Calculate the coordinates of the intersection rectangle + x1_inter = max(box1[0], box2[0]) + y1_inter = max(box1[1], box2[1]) + x2_inter = min(box1[2], box2[2]) + y2_inter = min(box1[3], box2[3]) + + # Calculate the area of the intersection rectangle + inter_area = max(0, x2_inter - x1_inter) * max(0, y2_inter - y1_inter) + + # Calculate the area of both bounding boxes + box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1]) + box2_area = (box2[2] - box2[0]) * (box2[3] - box2[1]) + + # Calculate the IoU + iou = inter_area / float(box1_area + box2_area - inter_area) + + return iou + + def non_max_suppression(self, predictions, iou_threshold: float = 0.5): + boxes = [p["box"] for p in predictions] + conf_scores = torch.tensor([p["confidence"] for p in predictions]) + phrases = [p["phrase"] for p in predictions] + + # Extract coordinates from predicted boxes + x0 = torch.tensor([box[0] for box in boxes]) + y0 = torch.tensor([box[1] for box in boxes]) + x1 = torch.tensor([box[2] for box in boxes]) + y1 = torch.tensor([box[3] for box in boxes]) + + # Calculate areas of predicted boxes and sort by confidence + areas = (x1 - x0) * (y1 - y0) + indices = conf_scores.argsort() + + # Initialize a list to hold the selected boxes + selected_boxes = [] + + while len(indices) > 0: + # Select the box with the highest confidence score + current_index = indices[-1] + selected_boxes.append({"box": boxes[current_index], "phrase": phrases[current_index], "confidence": conf_scores[current_index]}) + + indices = indices[:-1] + + # Select coordinates of boxes to compare with the current box + x0_coords = torch.index_select(x0, 0, indices) + y0_coords = torch.index_select(y0, 0, indices) + x1_coords = torch.index_select(x1, 0, indices) + y1_coords = torch.index_select(y1, 0, indices) + + # Determine coordinates of intersection boxes + inter_x0 = torch.max(x0[current_index], x0_coords) + inter_y0 = torch.max(y0[current_index], y0_coords) + inter_x1 = torch.min(x1[current_index], x1_coords) + inter_y1 = torch.min(y1[current_index], y1_coords) + + # Calculate heights and widths of the intersection boxes + widths = inter_x1 - inter_x0 + heights = inter_y1 - inter_y0 + + # Clamp the heights and width to avoid negative values + widths = torch.clamp(widths, min=0.0) + heights = torch.clamp(heights, min=0.0) + + # Calculate the intersection area + inter_area = widths * heights + + # Calculate the regular areas of the remaining boxes + areas_remaining = torch.index_select(areas, 0, indices) + + # Find the union ares of the predicted boxes in the current iteration + # with the selected box + union_area = areas_remaining + areas[current_index] - inter_area + + # Find IoU of predicted boxes with the selected box + iou = inter_area / union_area + + # Remove boxes with IoU below the threshold + indices = indices[iou < iou_threshold] + + return selected_boxes def run_grounding_dino( self, @@ -844,20 +938,14 @@ def run_grounding_dino( # desired image and text prompts. 
inference_time = time.time() - # Convert image to Image pillow - image_pil, image_transformed = self.load_image(image) + # Preprocess the image for GroundingDINO detection + _, image_transformed = self.load_image(image) # Lowercase and strip the caption caption = caption.lower().strip() # Run GroundingDINO on the image using the input caption image_transformed = image_transformed.to(device=self.device) - - # Display image transformed - image_pil.show() - #cv2.imshow("transformed", image_transformed) - #cv2.waitKey(0) - with torch.no_grad(): outputs = self.groundingdino( image_transformed[None], @@ -865,7 +953,6 @@ def run_grounding_dino( ) logits = outputs["pred_logits"].sigmoid()[0] boxes = outputs["pred_boxes"][0] - self._node.get_logger().info("... Done") # Filter the output based on the box and text thresholds boxes_cxcywh = {} @@ -878,42 +965,73 @@ def run_grounding_dino( # Tokenize the caption tokenizer = self.groundingdino.tokenizer caption_tokens = tokenizer(caption) - + # Build the dictionary of bounding boxes for each food item label detected for logit, box in zip(logits_filt, boxes_filt): # Predict phrases based on the bounding boxes and the text threshold - phrase = get_phrases_from_posmap( - logit > text_threshold, caption_tokens, tokenizer - ) + phrase = get_phrases_from_posmap(logit > text_threshold, caption_tokens, tokenizer) + confidence_score = logit.max().item() + + #print(f"{phrase}") if phrase not in boxes_cxcywh: boxes_cxcywh[phrase] = [] - boxes_cxcywh[phrase].append(box.cpu().numpy()) + #boxes_cxcywh[phrase].append(box.cpu().numpy()) + boxes_cxcywh[phrase].append({"box": box.cpu().numpy(), "confidence": confidence_score}) # Define height and width of image - height, width, _ = image.shape + if isinstance(image, torch.Tensor): + _, height, width = image.shape + else: + height, width, _ = image.shape # Convert the bounding boxes outputted by GroundingDINO to the following format # [top left x-value, top left y-value, bottom right x-value, bottom right y-value] # and unnormalize the bounding box coordinate values - boxes_xyxy = {} + # boxes_xyxy = {} + box_list = [] for phrase, boxes in boxes_cxcywh.items(): - boxes_xyxy[phrase] = [] + # boxes_xyxy[phrase] = [] for box in boxes: # Scale the box from percentage values to pixel values - box = np.multiply(box, np.array([width, height, width, height])) - center_x, center_y, w, h = box + scaled_box = np.multiply(box["box"], np.array([width, height, width, height])) + logit = box["confidence"] + center_x, center_y, w, h = scaled_box + # Get the bottom left and top right coordinates of the box x0 = center_x - (w / 2) y0 = center_y - (h / 2) x1 = x0 + w y1 = y0 + h - boxes_xyxy[phrase].append([x0, y0, x1, y1]) + # boxes_xyxy[phrase].append([x0, y0, x1, y1]) + box_list.append({"phrase": phrase, "box": [x0, y0, x1, y1], "confidence": logit}) # Measure the elapsed time running GroundingDINO on the image prompt inference_time = int(round((time.time() - inference_time) * 1000)) - - return boxes_xyxy - + self._node.get_logger().info(f"GroundingDINO inference time: {inference_time} ms") + start_time = time.time() + + # Perform non-maximum suppression to remove overlapping bounding boxes + # and select the most confident bounding boxes + selected_boxes = self.non_max_suppression(box_list, iou_threshold=0.8) + pre_nms_num_boxes = len(box_list) + + predictions = {} + for box in selected_boxes: + phrase = box["phrase"] + box_coords = box["box"] + if phrase not in predictions: + predictions[phrase] = [] + 
predictions[phrase].append(box_coords) + post_nms_num_boxes = len(selected_boxes) + + self._node.get_logger().info(f"Pre-NMS: {pre_nms_num_boxes}, Post-NMS: {post_nms_num_boxes}") + + # Measure elapsed time of running non-maximum suppression + nms_time = int(round((time.time() - start_time) * 1000)) + self._node.get_logger().info(f"Non-maximum suppression time: {nms_time} ms") + + return predictions + def load_image(self, image_array: npt.NDArray): """ Load the image and apply transformations to it. From 3a98848c48c6803e6a5cfe1ff6a766ea183f5129 Mon Sep 17 00:00:00 2001 From: sriramk117 Date: Wed, 4 Jun 2025 18:10:10 -0700 Subject: [PATCH 41/42] added nearest label interpolation --- .../segment_all_items.py | 143 +++++++++++++----- 1 file changed, 109 insertions(+), 34 deletions(-) diff --git a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py index 30644579..47be4c3d 100644 --- a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py +++ b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py @@ -8,7 +8,7 @@ # Standard imports import os import threading -from typing import Optional, Tuple, Union +from typing import Optional, Tuple, Union, Dict, List # Third-party imports import cv2 @@ -40,6 +40,8 @@ import base64 from openai import OpenAI from dotenv import load_dotenv +from transformers import BertTokenizer, BertModel +import torch.nn.functional as F # Local imports from ada_feeding_msgs.action import SegmentAllItems, GenerateCaption, SegmentFromBox @@ -180,6 +182,12 @@ def __init__(self, node: ADAFeedingPerceptionNode): else: self.initialize_sam(seg_model_name, seg_model_path) + # Initialize the labels list to store the input labels + self.labels_list: list[str] = [] + + # Initialize BERT for label embedding + self.bert_tokenizer, self.bert_model = self.initialize_bert() + # Convert between ROS and CV images self.bridge = CvBridge() @@ -556,6 +564,19 @@ def initialize_efficient_sam(self, model_name: str, model_path: str) -> None: self._node.get_logger().info("...Done!") + def initialize_bert(self) -> Tuple[BertTokenizer, BertModel]: + """ + Load the BERT model for label embedding. + + Returns + ------- + tokenizer (BertTokenizer): The BERT tokenizer. + model (BertModel): The BERT model. + """ + tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") + model = BertModel.from_pretrained("bert-base-uncased") + return tokenizer, model + def goal_callback(self, goal_request: SegmentAllItems.Goal) -> GoalResponse: """ Accept or reject the goal request based on the availability of the latest @@ -815,37 +836,6 @@ def run_efficient_sam( return masks, scores - def calculate_iou(box1: npt.NDArray, box2: npt.NDArray) -> float: - """ - Calculate the Intersection over Union (IoU) between two bounding boxes. - - Parameters - ---------- - box1: The first bounding box in the format [x1, y1, x2, y2]. - box2: The second bounding box in the format [x1, y1, x2, y2]. - - Returns - ------- - iou: The IoU between the two bounding boxes. 
- """ - # Calculate the coordinates of the intersection rectangle - x1_inter = max(box1[0], box2[0]) - y1_inter = max(box1[1], box2[1]) - x2_inter = min(box1[2], box2[2]) - y2_inter = min(box1[3], box2[3]) - - # Calculate the area of the intersection rectangle - inter_area = max(0, x2_inter - x1_inter) * max(0, y2_inter - y1_inter) - - # Calculate the area of both bounding boxes - box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1]) - box2_area = (box2[2] - box2[0]) * (box2[3] - box2[1]) - - # Calculate the IoU - iou = inter_area / float(box1_area + box2_area - inter_area) - - return iou - def non_max_suppression(self, predictions, iou_threshold: float = 0.5): boxes = [p["box"] for p in predictions] conf_scores = torch.tensor([p["confidence"] for p in predictions]) @@ -908,6 +898,82 @@ def non_max_suppression(self, predictions, iou_threshold: float = 0.5): indices = indices[iou < iou_threshold] return selected_boxes + + def get_label_embedding(self, label: str, tokenizer: BertTokenizer, model: BertModel) -> torch.Tensor: + """ + Get the label embedding for a given label using BERT. + + Parameters + ---------- + label (str): The label for which to get the embedding. + tokenizer (BertTokenizer): The BERT tokenizer. + model (BertModel): The BERT model. + + Returns + ------- + embedding (torch.Tensor): The BERT embedding of the label. + """ + # Tokenize and encode the label + inputs = tokenizer(label, return_tensors="pt") + + # Get the embeddings from BERT + with torch.no_grad(): + outputs = model(**inputs) + + final_hidden_state = outputs.last_hidden_state + label_embedding = torch.tensor(final_hidden_state.mean(dim=1)) + + # Return the last hidden state as the embedding + return label_embedding + + def nearest_label_interpolation(self, predictions: Dict, gt_labels: List[str], tokenizer: BertTokenizer, model: BertModel) -> Dict: + """ + Interpolate predicted labels to match ground truth labels by matching + labels together based on their embedding similarity. Then, create a new + prediction dictionary (label to masks) with the interpolated labels as keys. + + Parameters + ---------- + predictions (Dict): Dictionary of predicted masks. + gt_labels (List[str]): List of ground truth labels. + + Returns + ------- + interpolated_predictions (Dict): Dictionary of interpolated predictions. 
+ """ + if predictions is None or len(predictions) == 0: + print("No predictions found.") + return {} + + # Initialize the interpolated predictions dictionary + interpolated_predictions = {} + + # Convert predicted labels to BERT embeddings + pred_labels = list(predictions.keys()) + #print(f"Predicted labels: {pred_labels}") + pred_embeddings = [self.get_label_embedding(label, tokenizer, model) for label in pred_labels] + pred_embeddings = torch.stack(pred_embeddings) + + # Convert ground truth labels to BERT embeddings + gt_embeddings = [self.get_label_embedding(label, tokenizer, model) for label in gt_labels] + gt_embeddings = torch.stack(gt_embeddings) + + # Calculate the cosine similarity between predicted and ground truth embeddings + similarity_matrix = F.cosine_similarity(pred_embeddings.unsqueeze(1), gt_embeddings.unsqueeze(0), dim=-1) + #print(f"Similarity matrix shape: {similarity_matrix.shape}") + + # Find the nearest ground truth label for each predicted label + nearest_labels = torch.max(similarity_matrix, dim=1)[1] + old_to_new = {pred_labels[i]: gt_labels[nearest_labels[i]] for i in range(len(pred_labels))} + + # Create the new prediction dictionary with interpolated labels + for pred_label, boxes in predictions.items(): + new_label = old_to_new[pred_label] + if new_label not in interpolated_predictions: + interpolated_predictions[new_label] = [] + interpolated_predictions[new_label].extend(boxes) + + return interpolated_predictions def run_grounding_dino( self, @@ -1030,8 +1096,13 @@ def run_grounding_dino( nms_time = int(round((time.time() - start_time) * 1000)) self._node.get_logger().info(f"Non-maximum suppression time: {nms_time} ms") - return predictions - + # Interpolate the predictions for each image to the nearest label + interpolated_predictions = {} + interpolated = self.nearest_label_interpolation(predictions, self.labels_list, self.bert_tokenizer, self.bert_model) + interpolated_predictions = interpolated + + return interpolated_predictions + def load_image(self, image_array: npt.NDArray): """ Load the image and apply transformations to it. 
@@ -1412,6 +1483,10 @@ async def invoke_gpt4o_callback( # Get the input labels from the request input_labels = goal_handle.request.input_labels + # Store the input labels list + self.labels_list = input_labels + self._node.get_logger().info(f"Input labels: {self.input_labels}") + # Create a rate object to control the rate at which to return feedback rate = self._node.create_rate(self.rate_hz) From 914d0a0dfa1c1cc0eeabc00863153c9a925f4465 Mon Sep 17 00:00:00 2001 From: sriramk117 Date: Fri, 13 Jun 2025 00:22:26 -0700 Subject: [PATCH 42/42] added acquire food client for robot demos --- .../segment_all_items.py | 56 ++++++++++++++++--- cyclonedds.xml | 2 +- 2 files changed, 48 insertions(+), 10 deletions(-) diff --git a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py index 47be4c3d..9b30bed7 100644 --- a/ada_feeding_perception/ada_feeding_perception/segment_all_items.py +++ b/ada_feeding_perception/ada_feeding_perception/segment_all_items.py @@ -42,9 +42,14 @@ from dotenv import load_dotenv from transformers import BertTokenizer, BertModel import torch.nn.functional as F +import base64 +import yaml +import json +from rosidl_runtime_py.convert import message_to_ordereddict +# from rosidl_runtime_py.utilities import convert_message_to_ordereddict # requires ROS2 # Local imports -from ada_feeding_msgs.action import SegmentAllItems, GenerateCaption, SegmentFromBox +from ada_feeding_msgs.action import SegmentAllItems, GenerateCaption, SegmentFromBox, AcquireFood from ada_feeding_msgs.msg import Mask from ada_feeding_perception.helpers import ( BoundingBox, @@ -1097,11 +1102,11 @@ def run_grounding_dino( self._node.get_logger().info(f"Non-maximum suppression time: {nms_time} ms") # Interpolate the predictions for each image to the nearest label - interpolated_predictions = {} - interpolated = self.nearest_label_interpolation(predictions, self.labels_list, self.bert_tokenizer, self.bert_model) - interpolated_predictions = interpolated + # interpolated_predictions = {} + # interpolated = self.nearest_label_interpolation(predictions, self.labels_list, self.bert_tokenizer, self.bert_model) + # interpolated_predictions = interpolated - return interpolated_predictions + return predictions def load_image(self, image_array: npt.NDArray): """ @@ -1434,11 +1439,44 @@ async def segment_from_bbox( # Set the mask message as the result result.detected_item = mask_msg - # Save the mask message to a file + # self._node.get_logger().info( + # f"Segmented items: {result}" + # ) + # # Save the mask message to a file filename = f"segmentation_result_{item_id}.txt" - with open(filename, "wb") as f: - f.write(result.detected_item.mask.data) - self._node.get_logger().info(f"Saved segmentation result to {filename}") + # with open(filename, "w") as f: + # base64_str = base64.b64encode(result).decode("utf-8") + # f.write(base64_str) + # self._node.get_logger().info(f"Saved segmentation result to {filename}") + + # Create an AcquireFood Goal message + acquire_goal = AcquireFood.Goal() + acquire_goal.header = image_msg.header + acquire_goal.camera_info = result.camera_info + acquire_goal.detected_food = result.detected_item + + result_dict = message_to_ordereddict(acquire_goal) + # with open(filename.replace(".txt", ".yaml"), "w") as f: + # yaml.dump(result_dict, f, sort_keys=False) + # Convert OrderedDict to plain dict via JSON round-trip + clean_dict = json.loads(json.dumps(result_dict)) + + # Save to YAML + filename = 
f"segmentation_result_{item_id}.yaml" + with open(filename, "w") as f: + yaml.dump(clean_dict, f, sort_keys=False) + + with open(filename, 'r') as f: + yaml_obj = yaml.safe_load(f) + + # Convert to compact JSON string (which ROS CLI accepts) + goal_str = json.dumps(yaml_obj) + self._node.get_logger().info(f"Goal string: {goal_str}") + + # Save goal_str to a text file + cmd_filename = f"acquire_food_goal_{item_id}.txt" + with open(cmd_filename, "w") as f: + f.write(goal_str) return result diff --git a/cyclonedds.xml b/cyclonedds.xml index 884f8558..9510949a 100644 --- a/cyclonedds.xml +++ b/cyclonedds.xml @@ -8,7 +8,7 @@ - +