Commit 53bdab3

HPU bucketing (#489)
Signed-off-by: kaixuanliu <kaixuan.liu@intel.com>
1 parent: 2be18ab

File tree: 5 files changed (+61, -7 lines)

backends/python/server/text_embeddings_server/models/classification_model.py

Lines changed: 11 additions & 0 deletions

@@ -26,6 +26,17 @@ def __init__(
         model = model.to(dtype).to(device)
 
         self.hidden_size = model.config.hidden_size
+        position_offset = 0
+        model_type = model.config.model_type
+        if model_type in ["xlm-roberta", "camembert", "roberta"]:
+            position_offset = model.config.pad_token_id + 1
+        if hasattr(model.config, "max_seq_length"):
+            self.max_input_length = model.config.max_seq_length
+        else:
+            self.max_input_length = (
+                model.config.max_position_embeddings - position_offset
+            )
+
         self.has_position_ids = (
             inspect.signature(model.forward).parameters.get("position_ids", None)
             is not None

backends/python/server/text_embeddings_server/models/default_model.py

Lines changed: 11 additions & 0 deletions

@@ -30,6 +30,17 @@ def __init__(
         self.hidden_size = model.config.hidden_size
         self.pooling = Pooling(self.hidden_size, pooling_mode=pool)
 
+        position_offset = 0
+        model_type = model.config.model_type
+        if model_type in ["xlm-roberta", "camembert", "roberta"]:
+            position_offset = model.config.pad_token_id + 1
+        if hasattr(model.config, "max_seq_length"):
+            self.max_input_length = model.config.max_seq_length
+        else:
+            self.max_input_length = (
+                model.config.max_position_embeddings - position_offset
+            )
+
         self.has_position_ids = (
             inspect.signature(model.forward).parameters.get("position_ids", None)
             is not None
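
Both classification_model.py and default_model.py derive max_input_length the same way: RoBERTa-family checkpoints (xlm-roberta, camembert, roberta) start their position ids at pad_token_id + 1, so the usable sequence length is shorter than max_position_embeddings. A standalone restatement of that logic (the helper name is illustrative, not part of the patch):

def derive_max_input_length(config) -> int:
    position_offset = 0
    # RoBERTa-family models begin position ids at pad_token_id + 1,
    # so that many position embeddings are never available to real tokens.
    if config.model_type in ["xlm-roberta", "camembert", "roberta"]:
        position_offset = config.pad_token_id + 1
    # Sentence-transformers-style configs may carry an explicit limit.
    if hasattr(config, "max_seq_length"):
        return config.max_seq_length
    return config.max_position_embeddings - position_offset

For example, xlm-roberta-base ships with max_position_embeddings = 514 and pad_token_id = 1, which yields 514 - 2 = 512 usable tokens.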

backends/python/server/text_embeddings_server/models/flash_bert.py

Lines changed: 6 additions & 0 deletions

@@ -269,6 +269,12 @@ def forward(self, input_ids, token_type_ids, position_ids, cu_seqlens, max_s):
 class FlashBert(Model):
     def __init__(self, model_path: Path, device: torch.device, dtype: torch.dtype):
         config = BertConfig.from_pretrained(model_path)
+
+        if hasattr(config, "max_seq_length"):
+            self.max_input_length = config.max_seq_length
+        else:
+            self.max_input_length = config.max_position_embeddings
+
         with safe_open(model_path / "model.safetensors", framework="pt") as f:
             model = FlashBertModel(f, device, dtype, config)
         if device.type == "hpu":
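
flash_bert.py applies the same fallback without a position offset, since BERT position ids start at 0. With a stock transformers config, the hasattr/else pattern reduces to a getattr one-liner (illustrative only, not part of the patch):

from transformers import BertConfig

config = BertConfig()  # stock BERT defaults; no max_seq_length attribute
max_input_length = getattr(config, "max_seq_length", config.max_position_embeddings)
print(max_input_length)  # 512, BertConfig's default max_position_embeddings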

backends/python/server/text_embeddings_server/models/types.py

Lines changed: 25 additions & 5 deletions

@@ -1,3 +1,4 @@
+import os, math  # math is needed by the HPU bucketing in PaddedBatch.from_pb
 import torch
 
 from abc import ABC, abstractmethod
@@ -8,6 +9,11 @@
 from text_embeddings_server.pb.embed_pb2 import Embedding, Score
 
 tracer = trace.get_tracer(__name__)
+PAD_SEQUENCE_TO_MULTIPLE_OF = int(os.environ.get("PAD_SEQUENCE_TO_MULTIPLE_OF", 128))
+
+
+def round_up(number, k):
+    return (number + k - 1) // k * k
 
 
 class Batch(ABC):
@@ -30,11 +36,23 @@ class PaddedBatch(Batch):
 
     @classmethod
     @tracer.start_as_current_span("from_pb")
-    def from_pb(cls, pb: embed_pb2.EmbedRequest, device: torch.device) -> "PaddedBatch":
+    def from_pb(
+        cls, pb: embed_pb2.EmbedRequest, device: torch.device, max_input_length: int
+    ) -> "PaddedBatch":
+        if pb.max_length > max_input_length:
+            raise RuntimeError(f"input length {pb.max_length} exceeds the model's max_input_length ({max_input_length})")
+
+        batch_size = len(pb.cu_seq_lengths) - 1
+        if device.type == "hpu":
+            # To better utilize HPU, we need to do batch/seq_len bucketing
+            max_length = round_up(pb.max_length, PAD_SEQUENCE_TO_MULTIPLE_OF)
+            max_length = min(max_length, max_input_length)
+            new_bs = 2 ** math.ceil(math.log2(batch_size))
+        else:
+            new_bs = batch_size
+            max_length = pb.max_length
         # Allocate padded tensors all at once
-        all_tensors = torch.zeros(
-            [4, len(pb.cu_seq_lengths) - 1, pb.max_length], dtype=torch.int32
-        )
+        all_tensors = torch.zeros([4, new_bs, max_length], dtype=torch.int32)
 
         for i, start_index in enumerate(pb.cu_seq_lengths[:-1]):
             end_index = pb.cu_seq_lengths[i + 1]
@@ -77,7 +95,9 @@ class FlashBatch(Batch):
 
     @classmethod
     @tracer.start_as_current_span("from_pb")
-    def from_pb(cls, pb: embed_pb2.EmbedRequest, device: torch.device) -> "FlashBatch":
+    def from_pb(
+        cls, pb: embed_pb2.EmbedRequest, device: torch.device, max_input_length: int
+    ) -> "FlashBatch":
         batch_input_ids = torch.tensor(pb.input_ids, dtype=torch.int32, device=device)
         batch_token_type_ids = torch.tensor(
             pb.token_type_ids, dtype=torch.int32, device=device
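
The bucketing collapses the space of tensor shapes the device sees: the padded sequence length is rounded up to the next multiple of PAD_SEQUENCE_TO_MULTIPLE_OF (then capped at the model maximum), and the batch dimension is rounded up to the next power of two. Fewer distinct shapes means fewer HPU graph recompilations. A standalone sketch of the arithmetic with example values:

import math

PAD_SEQUENCE_TO_MULTIPLE_OF = 128  # default from the patch above

def round_up(number, k):
    return (number + k - 1) // k * k

# A batch of 5 sequences whose longest member is 130 tokens, model max 512:
max_length = min(round_up(130, PAD_SEQUENCE_TO_MULTIPLE_OF), 512)  # -> 256
new_bs = 2 ** math.ceil(math.log2(5))                              # -> 8
# The padded tensors are allocated as [4, 8, 256] instead of [4, 5, 130],
# and a later request of, say, 7 sequences x 200 tokens reuses the same shape.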

backends/python/server/text_embeddings_server/server.py

Lines changed: 8 additions & 2 deletions

@@ -25,14 +25,20 @@ async def Health(self, request, context):
         return embed_pb2.HealthResponse()
 
     async def Embed(self, request, context):
-        batch = self.model.batch_type.from_pb(request, self.model.device)
+        max_input_length = self.model.max_input_length
+        batch = self.model.batch_type.from_pb(
+            request, self.model.device, max_input_length
+        )
 
         embeddings = self.model.embed(batch)
 
         return embed_pb2.EmbedResponse(embeddings=embeddings)
 
     async def Predict(self, request, context):
-        batch = self.model.batch_type.from_pb(request, self.model.device)
+        max_input_length = self.model.max_input_length
+        batch = self.model.batch_type.from_pb(
+            request, self.model.device, max_input_length
+        )
 
         scores = self.model.predict(batch)
 
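
One operational note: PAD_SEQUENCE_TO_MULTIPLE_OF is read a single time, at import time, in types.py, so it must be present in the environment before the server process imports that module; a minimal sketch of setting it programmatically:

import os

# Must run before text_embeddings_server.models.types is imported,
# because that module reads the variable once at import time.
os.environ["PAD_SEQUENCE_TO_MULTIPLE_OF"] = "64"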
