Skip to content

Commit b06b752

Browse files
authored
Optimize the performance of FlashBert on HPU by using fast mode softmax (#555)
Signed-off-by: Liu, Kaixuan <kaixuan.liu@intel.com>
1 parent 8eb7a84 commit b06b752

File tree

2 files changed

+10
-6
lines changed

2 files changed

+10
-6
lines changed

backends/python/server/text_embeddings_server/models/flash_bert.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -305,6 +305,7 @@ def __init__(self, model_path: Path, device: torch.device, dtype: torch.dtype):
305305
with safe_open(model_path / "model.safetensors", framework="pt") as f:
306306
model = FlashBertModel(f, device, dtype, config)
307307
self.device = device
308+
self.dtype = dtype
308309
if device.type == "hpu":
309310
from habana_frameworks.torch.hpu import wrap_in_hpu_graph
310311

@@ -326,12 +327,15 @@ def embed(self, batch: Union[FlashBatch, PaddedBatch]) -> List[Embedding]:
326327
cu_seqlens = torch.cat(
327328
(input_lens.new_tensor([0]), input_lens.cumsum(-1).int())
328329
)
329-
mask = batch.attention_mask.to(torch.bool)
330+
mask = batch.attention_mask.bool()
330331
batch_size = input_lens.size(0)
331-
attn_mask = torch.empty(
332-
[batch_size, 1, 1, mask.shape[-1]], device=self.device
333-
).fill_(float("-inf"))
334-
attn_mask[:, :, :, :].masked_fill_(mask[:, None, None, :], 0)
332+
attn_mask = torch.full(
333+
[batch_size, 1, 1, mask.shape[-1]],
334+
fill_value=torch.finfo(self.dtype).min,
335+
device=self.device,
336+
dtype=self.dtype,
337+
)
338+
attn_mask.masked_fill_(mask[:, None, None, :], 0)
335339
elif isinstance(batch, FlashBatch):
336340
cu_seqlens = batch.cu_seqlens
337341
mask = None

backends/python/server/text_embeddings_server/utils/flash_attn.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ def hpu_attn(
7878
if is_causal:
7979
attn_mask = None
8080

81-
out_ = FusedSDPA.apply(q, k, v, attn_mask, 0.0, is_causal, softmax_scale)
81+
out_ = FusedSDPA.apply(q, k, v, attn_mask, 0.0, is_causal, softmax_scale, "fast", False)
8282
out_ = out_.transpose(1, 2)
8383
out.copy_(out_)
8484
return out

0 commit comments

Comments (0)