Add per-prompt score thresholds to owl_predictor, replacing the single threshold

ssmmoo1 · ssmmoo1 · commit 3acb74864892 · 2023-12-28T11:03:16.000-06:00
diff --git a/examples/owl_predict.py b/examples/owl_predict.py
@@ -30,21 +30,25 @@
 
     parser = argparse.ArgumentParser()
     parser.add_argument("--image", type=str, default="../assets/owl_glove_small.jpg")
-    parser.add_argument("--prompt", type=str, default="")
-    parser.add_argument("--threshold", type=float, default=0.1)
+    parser.add_argument("--prompt", type=str, default="an owl, a glove")
+    parser.add_argument("--thresholds", type=str, default="0.1,0.1")
     parser.add_argument("--output", type=str, default="../data/owl_predict_out.jpg")
     parser.add_argument("--model", type=str, default="google/owlvit-base-patch32")
-    parser.add_argument("--image_encoder_engine", type=str, default="../data/owlvit_image_encoder_patch32.engine")
+    parser.add_argument("--image_encoder_engine", type=str, default="../data/owl_image_encoder_patch32.engine")
     parser.add_argument("--profile", action="store_true")
     parser.add_argument("--num_profiling_runs", type=int, default=30)
     args = parser.parse_args()
 
     prompt = args.prompt.strip("][()")
-
     text = prompt.split(',')
-    
     print(text)
 
+    thresholds = args.thresholds.strip("][()")
+    thresholds = thresholds.split(',')
+    thresholds = [float(x) for x in thresholds]
+    print(thresholds)
+    
+
     predictor = OwlPredictor(
         args.model,
         image_encoder_engine=args.image_encoder_engine
@@ -58,7 +62,7 @@
         image=image, 
         text=text, 
         text_encodings=text_encodings,
-        threshold=args.threshold,
+        thresholds=thresholds,
         pad_square=False
     )
 
@@ -70,7 +74,7 @@
                 image=image, 
                 text=text, 
                 text_encodings=text_encodings,
-                threshold=args.threshold,
+                thresholds=thresholds,
                 pad_square=False
             )
         torch.cuda.current_stream().synchronize()
diff --git a/nanoowl/owl_predictor.py b/nanoowl/owl_predictor.py
@@ -274,7 +274,7 @@ def encode_rois(self, image: torch.Tensor, rois: torch.Tensor, pad_square: bool
     def decode(self, 
             image_output: OwlEncodeImageOutput, 
             text_output: OwlEncodeTextOutput,
-            threshold: float = 0.1
+            thresholds: List[float],
         ) -> OwlDecodeOutput:
 
         num_input_images = image_output.image_class_embeds.shape[0]
@@ -290,8 +290,16 @@ def decode(self,
         scores_max = scores_sigmoid.max(dim=-1)
         labels = scores_max.indices
         scores = scores_max.values
-
-        mask = (scores > threshold)
+        masks = []
+        for i, threshold in enumerate(thresholds):
+            label_mask = labels == i
+            score_mask = scores > threshold 
+            obj_mask = torch.logical_and(label_mask,score_mask)
+            masks.append(obj_mask) 
+        
+        mask = masks[0]
+        for mask_t in masks[1:]:
+            mask = torch.logical_or(mask, mask_t)
 
         input_indices = torch.arange(0, num_input_images, dtype=labels.dtype, device=labels.device)
         input_indices = input_indices[:, None].repeat(1, self.num_patches)
@@ -447,8 +455,9 @@ def predict(self,
             image: PIL.Image, 
             text: List[str], 
             text_encodings: Optional[OwlEncodeTextOutput],
+            thresholds: List[float],
             pad_square: bool = True,
-            threshold: float = 0.1
+            
         ) -> OwlDecodeOutput:
 
         image_tensor = self.image_preprocessor.preprocess_pil_image(image)
@@ -460,5 +469,5 @@ def predict(self,
 
         image_encodings = self.encode_rois(image_tensor, rois, pad_square=pad_square)
 
-        return self.decode(image_encodings, text_encodings, threshold)
+        return self.decode(image_encodings, text_encodings, thresholds)