
Commit fc716d5

fix: padding support in batch tokens (#93)
1 parent d3e5b5a commit fc716d5

File tree

10 files changed: +45 −6 lines changed

backends/candle/src/lib.rs
backends/candle/src/models.rs
backends/candle/src/models/bert.rs
backends/candle/src/models/flash_bert.rs
backends/candle/src/models/jina.rs
backends/core/src/lib.rs
backends/python/src/lib.rs
backends/src/lib.rs
core/src/queue.rs
router/src/main.rs

backends/candle/src/lib.rs

Lines changed: 4 additions & 0 deletions
@@ -126,6 +126,10 @@ impl Backend for CandleBackend {
         Ok(())
     }
 
+    fn is_padded(&self) -> bool {
+        self.model.is_padded()
+    }
+
     fn embed(&self, batch: Batch) -> Result<Vec<Embedding>, BackendError> {
         let results = self.model.embed(batch).e()?;
         let results = results.to_dtype(DType::F32).e()?.to_vec2().e()?;

backends/candle/src/models.rs

Lines changed: 2 additions & 0 deletions
@@ -19,6 +19,8 @@ mod jina;
 pub use flash_bert::FlashBertModel;
 
 pub(crate) trait Model {
+    fn is_padded(&self) -> bool;
+
     fn embed(&self, _batch: Batch) -> Result<Tensor> {
         candle::bail!("`embed` is not implemented for this model");
     }
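
A note on the new method's semantics, as I read them from the implementations below (the doc comment is mine, not part of the commit):

pub(crate) trait Model {
    /// `true` when the model runs on dense [batch_size, max_length] batches,
    /// so every sequence is padded to the longest one and padding tokens cost
    /// memory and compute; `false` when sequences are packed (as in the
    /// flash-attention model below) and only real tokens are processed.
    fn is_padded(&self) -> bool;

    // ... remaining methods unchanged ...
}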

backends/candle/src/models/bert.rs

Lines changed: 4 additions & 0 deletions
@@ -618,6 +618,10 @@ impl BertModel {
 }
 
 impl Model for BertModel {
+    fn is_padded(&self) -> bool {
+        true
+    }
+
     fn embed(&self, batch: Batch) -> Result<Tensor> {
         self.forward(batch)
     }

backends/candle/src/models/flash_bert.rs

Lines changed: 3 additions & 0 deletions
@@ -447,6 +447,9 @@ impl FlashBertModel {
 }
 
 impl Model for FlashBertModel {
+    fn is_padded(&self) -> bool {
+        false
+    }
     fn embed(&self, batch: Batch) -> Result<Tensor> {
         self.forward(batch)
     }

backends/candle/src/models/jina.rs

Lines changed: 3 additions & 0 deletions
@@ -595,6 +595,9 @@ impl JinaBertModel {
 }
 
 impl Model for JinaBertModel {
+    fn is_padded(&self) -> bool {
+        true
+    }
     fn embed(&self, batch: Batch) -> Result<Tensor> {
         self.forward(batch)
     }

backends/core/src/lib.rs

Lines changed: 2 additions & 0 deletions
@@ -20,6 +20,8 @@ pub trait Backend {
         None
     }
 
+    fn is_padded(&self) -> bool;
+
     fn embed(&self, batch: Batch) -> Result<Vec<Embedding>, BackendError>;
 
     fn predict(&self, batch: Batch) -> Result<Vec<Vec<f32>>, BackendError>;

backends/python/src/lib.rs

Lines changed: 4 additions & 0 deletions
@@ -65,6 +65,10 @@ impl Backend for PythonBackend {
         Ok(())
     }
 
+    fn is_padded(&self) -> bool {
+        false
+    }
+
     fn embed(&self, batch: Batch) -> Result<Vec<Embedding>, BackendError> {
         let results = self
             .tokio_runtime

backends/src/lib.rs

Lines changed: 3 additions & 0 deletions
@@ -21,6 +21,7 @@ pub struct Backend {
     backend_sender: mpsc::UnboundedSender<BackendCommand>,
     /// Health status
     health_receiver: watch::Receiver<bool>,
+    pub padded_model: bool,
     pub max_batch_size: Option<usize>,
     pub model_type: ModelType,
 }
@@ -42,6 +43,7 @@ impl Backend {
             uds_path,
             otlp_endpoint,
         )?;
+        let padded_model = backend.is_padded();
        let max_batch_size = backend.max_batch_size();
 
        let (health_sender, health_receiver) = watch::channel(false);
@@ -53,6 +55,7 @@
        Ok(Self {
            backend_sender,
            health_receiver,
+            padded_model,
            max_batch_size,
            model_type,
        })

core/src/queue.rs

Lines changed: 11 additions & 1 deletion
@@ -40,6 +40,7 @@ pub struct Queue {
 
 impl Queue {
     pub fn new(
+        padded_model: bool,
         max_batch_tokens: usize,
         max_batch_requests: Option<usize>,
         max_concurrent_requests: usize,
@@ -50,6 +51,7 @@ impl Queue {
         // Launch background queue task
         tokio::task::spawn_blocking(move || {
             queue_blocking_task(
+                padded_model,
                 max_batch_tokens,
                 max_batch_requests,
                 max_concurrent_requests,
@@ -93,6 +95,7 @@ impl Queue {
 
 // Background task responsible of the queue state
 fn queue_blocking_task(
+    padded_model: bool,
     max_batch_tokens: usize,
     max_batch_requests: Option<usize>,
     max_concurrent_requests: usize,
@@ -136,7 +139,14 @@ fn queue_blocking_task(
 
                 let entry_tokens = entry.encoding.input_ids.len();
 
-                if current_tokens + entry_tokens > max_batch_tokens {
+                let total_tokens = if padded_model {
+                    (max(max_length, entry_tokens as u32) * (metadata.len() + 1) as u32)
+                        as usize
+                } else {
+                    current_tokens + entry_tokens
+                };
+
+                if total_tokens > max_batch_tokens {
                     entries.push_front(entry);
                     break;
                 }
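
To make the new accounting concrete, here is a small self-contained sketch of the budget check above (my own illustration with made-up numbers, not code from the repository). For a padded model the whole batch is padded to its longest sequence, so the check multiplies that maximum by the prospective batch size instead of summing the real token counts.

use std::cmp::max;

// Sketch of the check in `queue_blocking_task`: `batch_lengths` stands in for
// the token counts of the entries already selected for the batch.
fn would_exceed(
    padded_model: bool,
    batch_lengths: &[usize],
    entry_tokens: usize,
    max_batch_tokens: usize,
) -> bool {
    let current_tokens: usize = batch_lengths.iter().sum();
    let max_length = batch_lengths.iter().copied().max().unwrap_or(0);
    let total_tokens = if padded_model {
        // Dense batches are padded to the longest sequence in the batch.
        max(max_length, entry_tokens) * (batch_lengths.len() + 1)
    } else {
        // Packed batches only pay for the tokens that are actually there.
        current_tokens + entry_tokens
    };
    total_tokens > max_batch_tokens
}

fn main() {
    // Batch already holds 512- and 128-token sequences; the candidate has 1024.
    // Padded cost: max(512, 1024) * 3 = 3072. Unpadded cost: 512 + 128 + 1024 = 1664.
    assert!(would_exceed(true, &[512, 128], 1024, 2048));
    assert!(!would_exceed(false, &[512, 128], 1024, 2048));
}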

router/src/main.rs

Lines changed: 9 additions & 5 deletions
@@ -331,14 +331,18 @@ async fn main() -> Result<()> {
         .await
         .context("Model backend is not healthy")?;
 
-    let max_batch_requests = backend.max_batch_size.map(|s| {
-        tracing::warn!("Backend does not support a batch size > {s}");
-        tracing::warn!("forcing `max_batch_requests={s}`");
-        s
-    });
+    let max_batch_requests = backend
+        .max_batch_size
+        .map(|s| {
+            tracing::warn!("Backend does not support a batch size > {s}");
+            tracing::warn!("forcing `max_batch_requests={s}`");
+            s
+        })
+        .or(args.max_batch_requests);
 
     // Queue logic
     let queue = Queue::new(
+        backend.padded_model,
         args.max_batch_tokens,
         max_batch_requests,
         args.max_concurrent_requests,
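
Besides wiring `backend.padded_model` into `Queue::new`, this hunk also lets `args.max_batch_requests` take effect when the backend does not impose a cap of its own. A minimal sketch of that precedence (a hypothetical helper for illustration, not code from the repository):

// Hypothetical helper illustrating the precedence above: a backend-imposed cap
// wins; otherwise the user-supplied value, if any, applies; otherwise there is
// no request cap and batches are limited only by the token budget.
fn effective_max_batch_requests(
    backend_cap: Option<usize>,
    user_cap: Option<usize>,
) -> Option<usize> {
    backend_cap.or(user_cap)
}

fn main() {
    assert_eq!(effective_max_batch_requests(Some(4), Some(32)), Some(4));
    assert_eq!(effective_max_batch_requests(None, Some(32)), Some(32));
    assert_eq!(effective_max_batch_requests(None, None), None);
}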
