Commit dc09523

testdig and czhu15 authored
Fix pooling (classify or embed) model returns nan when dtype is float16 (or downcasted to float16) (HabanaAI#1441)

cc: @yangulei @ranzhejiang @czhu15

---------

Co-authored-by: Bob Zhu <bob.zhu@intel.com>
1 parent 0d4fbe3 commit dc09523

File tree

1 file changed (+2 −0 lines)

vllm/worker/hpu_model_runner.py

Lines changed: 2 additions & 0 deletions
@@ -396,6 +396,8 @@ def _set_attn_bias(self, attn_metadata, batch_size, seq_len, device,
             len_mask_v = len_mask.view(batch_size, 1, seq_len, 1)
             mask = attn_mask.logical_or(len_mask).logical_or(len_mask_v)
             off_value = -3E38  #small number, avoid nan and overflow
+            if dtype == torch.float16:
+                off_value = -63000  # a small value close to float16.min
         else:
             mask = attn_mask.logical_or(
                 len_mask)  #no need for len_mask_v as decode overwrites it
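Why the fix works: float16 can only represent magnitudes up to about 65504, so the original off_value of -3E38 overflows to -inf when the mask is cast to float16. A fully masked attention row then becomes all -inf, and the softmax shift computes -inf - (-inf) = NaN, which propagates through the output. The sketch below (not part of the commit; it uses NumPy and a hypothetical naive softmax purely for illustration) reproduces the failure mode and shows that a finite off_value like -63000 avoids it:

```python
import numpy as np

# -3e38 is far below float16's minimum (~ -65504), so it saturates to -inf.
overflow_off = np.float16(-3e38)
safe_off = np.float16(-63000)  # finite, close to float16 min

def naive_softmax(x):
    # Standard max-shifted softmax. If every entry is -inf,
    # the shift computes (-inf) - (-inf) = NaN, which propagates.
    shifted = x - x.max()
    e = np.exp(shifted)
    return e / e.sum()

# A fully masked row: every position receives the off_value.
row_overflow = np.full(4, overflow_off, dtype=np.float16)
row_safe = np.full(4, safe_off, dtype=np.float16)

print(np.isinf(overflow_off))                       # -3e38 overflowed to -inf
print(np.isnan(naive_softmax(row_overflow)).any())  # NaN appears
print(np.isnan(naive_softmax(row_safe)).any())      # finite off_value stays NaN-free
```

With -63000 the fully masked row still softmaxes to a uniform (harmless) distribution instead of NaN, which is why the patch swaps the bias value only when dtype is torch.float16.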
