refactor: revert to range checks for vision token detection

Sigrid Jin (Sionic AI) · Sigrid Jin (Sionic AI) · commit 6b31c668c432 · 2025-07-18T17:49:23.000+09:00
Revert the torch.isin optimization; as DarkLight1337 pointed out, it is
incorrect. torch.isin only matches tokens that are exactly
VISION_START_TOKEN_ID or VISION_END_TOKEN_ID, but we need to match ALL
tokens in the inclusive range [VISION_START_TOKEN_ID, VISION_END_TOKEN_ID].

The range check:
- Matches all tokens within the range (not just the endpoints)
- Keeps working if the vision token range is expanded in the future
- Expresses the semantic intent of the code

Keep the get_pooling_params consolidation as that change is correct.

Signed-off-by: Sigrid Jin (Sionic AI) <sigrid@sionic.ai>
diff --git a/vllm/model_executor/models/jina_embeddings_v4.py b/vllm/model_executor/models/jina_embeddings_v4.py
@@ -50,10 +50,6 @@ def __init__(self,
         self.pooling_backend = pooling_backend
         self.observability_config = vllm_config.observability_config
 
-        # Pre-compute vision token IDs tensor for efficient checking
-        self.vision_token_ids = torch.tensor(
-            [VISION_START_TOKEN_ID, VISION_END_TOKEN_ID], dtype=torch.long)
-
         # Performance tracking
         self._pooling_time_ms = 0.0
         self._pooling_count = 0
@@ -207,8 +203,8 @@ def _apply_vision_pooling_optimized(
                                  dtype=hidden_states.dtype)
 
             # Check for vision tokens
-            has_vision = torch.isin(token_tensor,
-                                    self.vision_token_ids.to(device)).any()
+            has_vision = torch.any((token_tensor >= VISION_START_TOKEN_ID)
+                                   & (token_tensor <= VISION_END_TOKEN_ID))
 
             if has_vision:
                 # Use Triton kernel for vision token extraction
@@ -259,8 +255,8 @@ def _apply_vision_pooling_pytorch(
                                       device=hidden_states.device)
 
             # Check for vision tokens
-            vision_mask = torch.isin(
-                seq_tokens, self.vision_token_ids.to(seq_tokens.device))
+            vision_mask = ((seq_tokens >= VISION_START_TOKEN_ID) &
+                           (seq_tokens <= VISION_END_TOKEN_ID))
 
             if vision_mask.any():
                 # Pool only vision tokens