
Commit a059696 (1 parent: 24533a0)

feat: add embed_raw route to get all embeddings without pooling (#154)

33 files changed: +26845 −312 lines

Cargo.lock

Lines changed: 9 additions & 0 deletions
Some generated files are not rendered by default.

backends/candle/Cargo.toml

Lines changed: 1 addition & 0 deletions
@@ -15,6 +15,7 @@ candle-flash-attn = { version = "^0.3", optional = true }
 candle-flash-attn-v1 = { git = "https://github.com/huggingface/candle-flash-attn-v1", rev = "d5b873e4555b7f460ed639d96f26cb014f2daad7", optional = true }
 candle-cublaslt = { git = "https://github.com/huggingface/candle-cublaslt", rev = "c8a810ffe649c5f4634cbe1f0aaf02f6025fe5a5", optional = true }
 candle-layer-norm = { git = "https://github.com/huggingface/candle-layer-norm", rev = "0dd5bdceb9ba7cded921c62f9ddd66e7726327ba", optional = true }
+nohash-hasher = "^0.2"
 text-embeddings-backend-core = { path = "../core" }
 tracing = "^0.1"
 safetensors = "^0.4"
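
The only manifest change pulls in nohash-hasher, an identity hasher for integer keys: the backend keys its result maps by batch index, so real hashing is pure overhead. A minimal, self-contained sketch of that pattern (values illustrative, not from this commit):

    use nohash_hasher::BuildNoHashHasher;
    use std::collections::HashMap;

    fn main() {
        // Batch indices are small, dense usizes; an identity hasher
        // skips SipHash entirely on every insert and lookup.
        let mut by_index: HashMap<usize, Vec<f32>, BuildNoHashHasher<usize>> =
            HashMap::with_capacity_and_hasher(2, BuildNoHashHasher::default());
        by_index.insert(0, vec![0.1, 0.2]);
        by_index.insert(1, vec![0.3]);
        assert_eq!(by_index[&1], vec![0.3]);
    }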

backends/candle/src/lib.rs

Lines changed: 58 additions & 7 deletions
@@ -18,8 +18,12 @@ use crate::models::{BertModel, JinaBertModel, Model, PositionEmbeddingType};
 use candle::{DType, Device};
 use candle_nn::VarBuilder;
 use models::Config;
+use nohash_hasher::BuildNoHashHasher;
+use std::collections::HashMap;
 use std::path::PathBuf;
-use text_embeddings_backend_core::{Backend, BackendError, Batch, Embedding, ModelType};
+use text_embeddings_backend_core::{
+    Backend, BackendError, Batch, Embedding, Embeddings, ModelType, Predictions,
+};
 
 pub struct CandleBackend {
     model: Box<dyn Model + Send>,
@@ -148,16 +152,63 @@ impl Backend for CandleBackend {
         self.model.is_padded()
     }
 
-    fn embed(&self, batch: Batch) -> Result<Vec<Embedding>, BackendError> {
-        let results = self.model.embed(batch).e()?;
-        let results = results.to_dtype(DType::F32).e()?.to_vec2().e()?;
-        Ok(results)
+    fn embed(&self, batch: Batch) -> Result<Embeddings, BackendError> {
+        let batch_size = batch.len();
+        let pooled_indices = batch.pooled_indices.clone();
+        let raw_indices = batch.raw_indices.clone();
+
+        // Used for indexing in the raw_embeddings tensor
+        let input_lengths: Vec<usize> = (0..batch.len())
+            .map(|i| {
+                (batch.cumulative_seq_lengths[i + 1] - batch.cumulative_seq_lengths[i]) as usize
+            })
+            .collect();
+
+        // Run forward
+        let (pooled_embeddings, raw_embeddings) = self.model.embed(batch).e()?;
+
+        // Device => Host data transfer
+        let pooled_embeddings = match pooled_embeddings {
+            None => vec![],
+            Some(pooled_embeddings) => pooled_embeddings.to_dtype(DType::F32).e()?.to_vec2().e()?,
+        };
+
+        // This transfer is expensive...
+        let raw_embeddings = match raw_embeddings {
+            None => vec![],
+            Some(raw_embeddings) => raw_embeddings.to_dtype(DType::F32).e()?.to_vec2().e()?,
+        };
+
+        let mut embeddings =
+            HashMap::with_capacity_and_hasher(batch_size, BuildNoHashHasher::default());
+        for (i, e) in pooled_indices.into_iter().zip(pooled_embeddings) {
+            embeddings.insert(i as usize, Embedding::Pooled(e));
+        }
+
+        let mut cumulative_length = 0;
+        for i in raw_indices.into_iter() {
+            let length = input_lengths[i as usize];
+            let e = raw_embeddings[cumulative_length..cumulative_length + length].to_vec();
+            embeddings.insert(i as usize, Embedding::All(e));
+            cumulative_length += length;
+        }
+
+        Ok(embeddings)
     }
 
-    fn predict(&self, batch: Batch) -> Result<Vec<Vec<f32>>, BackendError> {
+    fn predict(&self, batch: Batch) -> Result<Predictions, BackendError> {
+        let batch_size = batch.len();
+
         let results = self.model.predict(batch).e()?;
         let results = results.to_dtype(DType::F32).e()?.to_vec2().e()?;
-        Ok(results)
+
+        let mut predictions =
+            HashMap::with_capacity_and_hasher(batch_size, BuildNoHashHasher::default());
+        for (i, r) in results.into_iter().enumerate() {
+            predictions.insert(i, r);
+        }
+
+        Ok(predictions)
     }
 }
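
After the device-to-host transfer, raw token rows arrive packed back to back in raw_indices order, so the backend reslices them with a running offset derived from cumulative_seq_lengths. A tensor-free sketch of that bookkeeping (string labels stand in for embedding rows; all values illustrative):

    fn main() {
        // Three sequences of lengths 2, 3 and 1, stored as prefix sums.
        let cumulative_seq_lengths: Vec<u32> = vec![0, 2, 5, 6];
        let input_lengths: Vec<usize> = (0..3)
            .map(|i| (cumulative_seq_lengths[i + 1] - cumulative_seq_lengths[i]) as usize)
            .collect();
        assert_eq!(input_lengths, vec![2, 3, 1]);

        // Members 2 and 0 asked for raw embeddings; their token rows come
        // back packed in that order, so slicing walks a running offset.
        let raw_indices: Vec<u32> = vec![2, 0];
        let packed_rows = vec!["t2.0", "t0.0", "t0.1"];

        let mut cumulative_length = 0;
        for &i in &raw_indices {
            let length = input_lengths[i as usize];
            let rows = &packed_rows[cumulative_length..cumulative_length + length];
            println!("member {i}: {rows:?}"); // member 2: ["t2.0"], member 0: ["t0.0", "t0.1"]
            cumulative_length += length;
        }
    }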

backends/candle/src/models.rs

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@ pub use flash_jina::FlashJinaBertModel;
 pub(crate) trait Model {
     fn is_padded(&self) -> bool;
 
-    fn embed(&self, _batch: Batch) -> Result<Tensor> {
+    fn embed(&self, _batch: Batch) -> Result<(Option<Tensor>, Option<Tensor>)> {
         candle::bail!("`embed` is not implemented for this model");
     }
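
The trait's default implementation now advertises the dual (pooled, raw) contract. A compile-checked sketch of the same shape with toy types (the aliases and names below are stand-ins, not the crate's own):

    type Tensor = Vec<f32>;
    type Result<T> = std::result::Result<T, String>;

    struct Batch;

    trait Model {
        fn is_padded(&self) -> bool;

        // Implementors return None on whichever side the batch did not
        // request; the default mirrors candle::bail! with a plain error.
        fn embed(&self, _batch: Batch) -> Result<(Option<Tensor>, Option<Tensor>)> {
            Err("`embed` is not implemented for this model".to_string())
        }
    }

    struct Classifier;
    impl Model for Classifier {
        fn is_padded(&self) -> bool { true }
        // No embed override: calling embed still reports the error.
    }

    fn main() {
        assert!(Classifier.embed(Batch).is_err());
    }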

backends/candle/src/models/bert.rs

Lines changed: 84 additions & 16 deletions
@@ -515,10 +515,10 @@ impl BertModel {
         })
     }
 
-    pub fn forward(&self, batch: Batch) -> Result<Tensor> {
+    pub fn forward(&self, batch: Batch) -> Result<(Option<Tensor>, Option<Tensor>)> {
         let _enter = self.span.enter();
 
-        let batch_size = batch.cumulative_seq_lengths.len() - 1;
+        let batch_size = batch.len();
         let max_length = batch.max_length as usize;
 
         let shape = (batch_size, max_length);
@@ -634,25 +634,91 @@ impl BertModel {
             .embeddings
             .forward(&input_ids, &type_ids, &position_ids)?;
 
-        let mut outputs = self
+        let outputs = self
             .encoder
             .forward(&embedding_output, attention_bias.as_ref())?;
 
-        let results = match self.pool {
-            // CLS pooling
-            Pool::Cls => outputs.i((.., 0))?,
-            // Mean pooling
-            Pool::Mean => {
-                if let Some(attention_mask) = attention_mask {
-                    // Mask padded values
-                    outputs = outputs.broadcast_mul(&attention_mask)?;
+        let has_pooling_requests = !batch.pooled_indices.is_empty();
+        let has_raw_requests = !batch.raw_indices.is_empty();
+
+        let pooled_embeddings = if has_pooling_requests {
+            let pooled_indices_length = batch.pooled_indices.len();
+            let mut outputs = outputs.clone();
+
+            // Only use pooled_indices if at least one member of the batch asks for raw embeddings
+            let pooled_indices = if has_raw_requests {
+                let pooled_indices =
+                    Tensor::from_vec(batch.pooled_indices, pooled_indices_length, &self.device)?;
+
+                // Select values in the batch
+                outputs = outputs.index_select(&pooled_indices, 0)?;
+                Some(pooled_indices)
+            } else {
+                None
+            };
+
+            let pooled_embeddings = match self.pool {
+                // CLS pooling
+                Pool::Cls => outputs.i((.., 0))?,
+                // Mean pooling
+                Pool::Mean => {
+                    if let Some(ref attention_mask) = attention_mask {
+                        let mut attention_mask = attention_mask.clone();
+
+                        if let Some(pooled_indices) = pooled_indices {
+                            // Select values in the batch
+                            attention_mask = attention_mask.index_select(&pooled_indices, 0)?;
+                        };
+
+                        // Mask padded values
+                        outputs = outputs.broadcast_mul(&attention_mask)?;
+                    }
+
+                    (outputs.sum(1)?.broadcast_div(&input_lengths))?
                 }
+            };
+            Some(pooled_embeddings)
+        } else {
+            None
+        };
 
-                (outputs.sum(1)?.broadcast_div(&input_lengths))?
+        let raw_embeddings = if has_raw_requests {
+            // Reshape outputs
+            let (b, l, h) = outputs.shape().dims3()?;
+            let outputs = outputs.reshape((b * l, h))?;
+
+            // We need to remove the padding tokens only if batch_size > 1 and there are some
+            // members of the batch that require pooling,
+            // or if batch_size > 1 and the members of the batch have different lengths
+            if (attention_mask.is_some() || has_pooling_requests) && batch_size > 1 {
+                let mut final_indices: Vec<u32> = Vec::with_capacity(batch_size * max_length);
+
+                for i in batch.raw_indices.into_iter() {
+                    let start = i * batch.max_length;
+                    let i = i as usize;
+                    let length =
+                        batch.cumulative_seq_lengths[i + 1] - batch.cumulative_seq_lengths[i];
+
+                    for j in start..start + length {
+                        // Add indices for the tokens of this specific member of the batch
+                        final_indices.push(j);
+                    }
+                }
+
+                let final_indices_length = final_indices.len();
+                let final_indices =
+                    Tensor::from_vec(final_indices, final_indices_length, &self.device)?;
+
+                // Select the tokens with final indices
+                Some(outputs.index_select(&final_indices, 0)?)
+            } else {
+                Some(outputs)
             }
+        } else {
+            None
         };
 
-        Ok(results)
+        Ok((pooled_embeddings, raw_embeddings))
     }
 }
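
The raw branch above flattens the (batch, max_length, hidden) output to (batch * max_length, hidden) and then gathers only real-token rows via final_indices. A small sketch of that index arithmetic in isolation (no tensors; lengths illustrative):

    fn main() {
        // Two sequences padded to max_length = 4, with true lengths 2 and 3.
        let max_length: u32 = 4;
        let cumulative_seq_lengths: Vec<u32> = vec![0, 2, 5];
        let raw_indices: Vec<u32> = vec![0, 1];

        let mut final_indices: Vec<u32> = Vec::new();
        for i in raw_indices {
            // Member i's rows start at i * max_length in the flattened tensor.
            let start = i * max_length;
            let i = i as usize;
            let length = cumulative_seq_lengths[i + 1] - cumulative_seq_lengths[i];
            final_indices.extend(start..start + length);
        }

        // Padding rows 2, 3 and 7 are skipped.
        assert_eq!(final_indices, vec![0, 1, 4, 5, 6]);
    }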

@@ -661,16 +727,18 @@ impl Model for BertModel {
         true
     }
 
-    fn embed(&self, batch: Batch) -> Result<Tensor> {
+    fn embed(&self, batch: Batch) -> Result<(Option<Tensor>, Option<Tensor>)> {
         self.forward(batch)
     }
 
     fn predict(&self, batch: Batch) -> Result<Tensor> {
         match &self.classifier {
             None => candle::bail!("`predict` is not implemented for this model"),
             Some(classifier) => {
-                let hidden_states = self.forward(batch)?;
-                classifier.forward(&hidden_states)
+                let (pooled_embeddings, _raw_embeddings) = self.forward(batch)?;
+                let pooled_embeddings =
+                    pooled_embeddings.expect("pooled_embeddings is empty. This is a bug.");
+                classifier.forward(&pooled_embeddings)
             }
         }
     }
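
Downstream of these changes, callers of embed receive a per-index map holding one of two variants, while predict keeps consuming only the pooled half. A toy sketch of consuming such a map (the enum below mirrors, but is not, text_embeddings_backend_core's Embedding):

    use std::collections::HashMap;

    // Stand-ins for the core crate's Embedding/Embeddings types.
    enum Embedding {
        Pooled(Vec<f32>),    // one vector per sequence
        All(Vec<Vec<f32>>),  // one vector per token (the new embed_raw path)
    }
    type Embeddings = HashMap<usize, Embedding>;

    fn main() {
        let mut out: Embeddings = HashMap::new();
        out.insert(0, Embedding::Pooled(vec![0.1, 0.2]));
        out.insert(1, Embedding::All(vec![vec![0.3, 0.4], vec![0.5, 0.6]]));

        for (i, e) in &out {
            match e {
                Embedding::Pooled(v) => println!("member {i}: pooled, dim {}", v.len()),
                Embedding::All(t) => println!("member {i}: {} token embeddings", t.len()),
            }
        }
    }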
