perf: optimize vision token detection using torch.isin

Sigrid Jin (Sionic AI) · Sigrid Jin (Sionic AI) · commit 702fd162f798 · 2025-07-18T17:04:44.000+09:00
Implement efficiency improvements suggested by DarkLight1337:
- Consolidate the "embed" and "encode" branches of get_pooling_params into a
  single condition, since both return the same PoolingParams
- Pre-compute vision token IDs tensor in constructor
- Replace the range checks at lines 209-210 and 261-262 with torch.isin for
  more efficient vision token detection

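This reduces redundant code and improves performance when checking for
vision tokens by using optimized tensor operations.

Note: torch.isin matches only VISION_START_TOKEN_ID and VISION_END_TOKEN_ID
themselves, whereas the previous range check also matched every ID between
them. The two checks are equivalent only if no other token ID lies between
the start and end IDs.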

Signed-off-by: Sigrid Jin (Sionic AI) <sigrid@sionic.ai>
diff --git a/vllm/model_executor/models/jina_embeddings_v4.py b/vllm/model_executor/models/jina_embeddings_v4.py
@@ -50,6 +50,10 @@ def __init__(self,
         self.pooling_backend = pooling_backend
         self.observability_config = vllm_config.observability_config
 
+        # Pre-compute vision token IDs tensor for efficient checking
+        self.vision_token_ids = torch.tensor(
+            [VISION_START_TOKEN_ID, VISION_END_TOKEN_ID], dtype=torch.long)
+
         # Performance tracking
         self._pooling_time_ms = 0.0
         self._pooling_count = 0
@@ -64,10 +68,7 @@ def __init__(self,
 
     def get_pooling_params(self, task: PoolingTask) -> Optional[PoolingParams]:
         """Return pooling params for embedding task."""
-        if task == "embed":
-            return PoolingParams(logits_processing_needs_token_ids=True)
-
-        if task == "encode":
+        if task == "embed" or task == "encode":
             return PoolingParams(logits_processing_needs_token_ids=True)
 
         # The equalities are split up to keep mypy happy
@@ -206,8 +207,8 @@ def _apply_vision_pooling_optimized(
                                  dtype=hidden_states.dtype)
 
             # Check for vision tokens
-            has_vision = torch.any((token_tensor >= VISION_START_TOKEN_ID)
-                                   & (token_tensor <= VISION_END_TOKEN_ID))
+            has_vision = torch.isin(token_tensor,
+                                    self.vision_token_ids.to(device)).any()
 
             if has_vision:
                 # Use Triton kernel for vision token extraction
@@ -258,8 +259,8 @@ def _apply_vision_pooling_pytorch(
                                       device=hidden_states.device)
 
             # Check for vision tokens
-            vision_mask = ((seq_tokens >= VISION_START_TOKEN_ID) &
-                           (seq_tokens <= VISION_END_TOKEN_ID))
+            vision_mask = torch.isin(
+                seq_tokens, self.vision_token_ids.to(seq_tokens.device))
 
             if vision_mask.any():
                 # Pool only vision tokens