
Commit 7af5bd6

feat: add --pooling arg (#14)
1 parent 63cc6d4 · commit 7af5bd6

File tree

17 files changed: +257 -163 lines


Cargo.lock

Lines changed: 11 additions & 10 deletions
Some generated files are not rendered by default.

README.md

Lines changed: 27 additions & 7 deletions
@@ -102,18 +102,22 @@ Usage: text-embeddings-router [OPTIONS]
 
 Options:
       --model-id <MODEL_ID>
-          The name of the model to load. Can be a MODEL_ID as listed on <https://hf.co/models> like `thenlper/gte-base`. Or it can be a local directory containing the necessary files as saved by `save_pretrained(...)` methods of transformers
+          The name of the model to load. Can be a MODEL_ID as listed on <https://hf.co/models> like `thenlper/gte-base`.
+          Or it can be a local directory containing the necessary files as saved by `save_pretrained(...)` methods of
+          transformers
 
           [env: MODEL_ID=]
           [default: thenlper/gte-base]
 
       --revision <REVISION>
-          The actual revision of the model if you're referring to a model on the hub. You can use a specific commit id or a branch like `refs/pr/2`
+          The actual revision of the model if you're referring to a model on the hub. You can use a specific commit id
+          or a branch like `refs/pr/2`
 
           [env: REVISION=]
 
       --tokenization-workers <TOKENIZATION_WORKERS>
-          Optionally control the number of tokenizer workers used for payload tokenization, validation and truncation. Default to the number of CPU cores on the machine
+          Optionally control the number of tokenizer workers used for payload tokenization, validation and truncation.
+          Default to the number of CPU cores on the machine
 
           [env: TOKENIZATION_WORKERS=]
 
@@ -124,8 +128,21 @@
           [default: float16]
           [possible values: float16, float32]
 
+      --pooling <POOLING>
+          Optionally control the pooling method.
+
+          If `pooling` is not set, the pooling configuration will be parsed from the model `1_Pooling/config.json`
+          configuration.
+
+          If `pooling` is set, it will override the model pooling configuration
+
+          [env: POOLING=]
+          [possible values: cls, mean]
+
       --max-concurrent-requests <MAX_CONCURRENT_REQUESTS>
-          The maximum amount of concurrent requests for this particular deployment. Having a low limit will refuse clients requests instead of having them wait for too long and is usually good to handle backpressure correctly
+          The maximum amount of concurrent requests for this particular deployment.
+          Having a low limit will refuse clients requests instead of having them wait for too long and is usually good
+          to handle backpressure correctly
 
           [env: MAX_CONCURRENT_REQUESTS=]
           [default: 512]
@@ -137,7 +154,8 @@ Options:
 
           For `max_batch_tokens=1000`, you could fit `10` queries of `total_tokens=100` or a single query of `1000` tokens.
 
-          Overall this number should be the largest possible until the model is compute bound. Since the actual memory overhead depends on the model implementation, text-embeddings-inference cannot infer this number automatically.
+          Overall this number should be the largest possible until the model is compute bound. Since the actual memory
+          overhead depends on the model implementation, text-embeddings-inference cannot infer this number automatically.
 
           [env: MAX_BATCH_TOKENS=]
           [default: 16384]
@@ -171,13 +189,15 @@ Options:
           [default: 3000]
 
       --uds-path <UDS_PATH>
-          The name of the unix socket some text-embeddings-inference backends will use as they communicate internally with gRPC
+          The name of the unix socket some text-embeddings-inference backends will use as they communicate internally
+          with gRPC
 
           [env: UDS_PATH=]
           [default: /tmp/text-embeddings-inference-server]
 
       --huggingface-hub-cache <HUGGINGFACE_HUB_CACHE>
-          The location of the huggingface hub cache. Used to override the location if you want to provide a mounted disk for instance
+          The location of the huggingface hub cache. Used to override the location if you want to provide a mounted disk
+          for instance
 
           [env: HUGGINGFACE_HUB_CACHE=/data]
 
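A quick illustration of the new option (hypothetical command lines; the flag names, values, and default model id are taken from the help text above):

    # Override the model's pooling configuration with mean pooling
    text-embeddings-router --model-id thenlper/gte-base --pooling mean

    # Same override through the documented environment variable
    POOLING=cls text-embeddings-router --model-id thenlper/gte-base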

backends/Cargo.toml

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ tokio = { version = "^1.25", features = ["sync"] }
 tracing = "^0.1"
 
 [features]
-clap = [ "dep:clap" ]
+clap = ["dep:clap", "text-embeddings-backend-core/clap"]
 python = ["dep:text-embeddings-backend-python"]
 candle = ["dep:text-embeddings-backend-candle"]
 mkl = ["text-embeddings-backend-candle?/mkl"]
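Forwarding `clap` to `text-embeddings-backend-core` suggests the shared `Pool` type gates a `clap::ValueEnum` derive behind that feature, which is what lets `--pooling` parse straight into the enum. A speculative sketch (the derive list and doc comments are assumptions; only the `cls`/`mean` values are confirmed by the README help text):

    // Hypothetical declaration in text-embeddings-backend-core.
    // The ValueEnum derive compiles only with the `clap` feature enabled,
    // hence the forwarded feature in backends/Cargo.toml above.
    #[derive(Debug, Clone, Copy, PartialEq)]
    #[cfg_attr(feature = "clap", derive(clap::ValueEnum))]
    pub enum Pool {
        /// Select the CLS token embedding
        Cls,
        /// Average all token embeddings
        Mean,
    }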

backends/candle/src/lib.rs

Lines changed: 16 additions & 27 deletions
@@ -9,29 +9,23 @@ use crate::compute_cap::{incompatible_compute_cap, COMPILE_COMPUTE_CAP, RUNTIME_
 use crate::models::{BertModel, EmbeddingModel, QuantBertModel};
 use candle::{DType, Device};
 use candle_nn::VarBuilder;
-use models::{Config, PoolConfig};
+use models::Config;
 use std::path::PathBuf;
-use text_embeddings_backend_core::{BackendError, Batch, Embedding, EmbeddingBackend};
+use text_embeddings_backend_core::{BackendError, Batch, Embedding, EmbeddingBackend, Pool};
 
 pub struct CandleBackend {
     model: Box<dyn EmbeddingModel + Send>,
     device: Device,
 }
 
 impl CandleBackend {
-    pub fn new(model_path: PathBuf, dtype: String) -> Result<Self, BackendError> {
+    pub fn new(model_path: PathBuf, dtype: String, pool: Pool) -> Result<Self, BackendError> {
         // Load config
         let config: String = std::fs::read_to_string(model_path.join("config.json"))
             .map_err(|err| BackendError::Start(err.to_string()))?;
         let config: Config =
             serde_json::from_str(&config).map_err(|err| BackendError::Start(err.to_string()))?;
 
-        // Load pooling config
-        let pool_config: String = std::fs::read_to_string(model_path.join("1_Pooling/config.json"))
-            .map_err(|err| BackendError::Start(err.to_string()))?;
-        let pool_config: PoolConfig = serde_json::from_str(&pool_config)
-            .map_err(|err| BackendError::Start(err.to_string()))?;
-
         // Get candle device
         let device = match Device::cuda_if_available(0) {
             Ok(device) => device,
@@ -71,23 +65,17 @@
             } else {
                 VarBuilder::from_pth(model_path.join("pytorch_model.bin"), dtype, &device)
             }
-            .map_err(|err| BackendError::Start(err.to_string()))?;
+            .s()?;
 
-            Box::new(
-                BertModel::load(vb, &config, pool_config.into())
-                    .map_err(|err| BackendError::Start(err.to_string()))?,
-            )
+            Box::new(BertModel::load(vb, &config, pool).s()?)
         } else if &dtype == "q6k" {
             let vb = candle_transformers::quantized_var_builder::VarBuilder::from_gguf(
                 model_path.join("ggml-model-q6k.bin"),
             )
             .map_err(|err| BackendError::Start(err.to_string()))?;
             tracing::info!("vb");
 
-            Box::new(
-                QuantBertModel::load(vb, &config, pool_config.into())
-                    .map_err(|err| BackendError::Start(err.to_string()))?,
-            )
+            Box::new(QuantBertModel::load(vb, &config, pool).s()?)
         } else {
             return Err(BackendError::Start(format!(
                 "dtype {dtype} is not supported"
@@ -126,17 +114,14 @@
             } else {
                 VarBuilder::from_pth(model_path.join("pytorch_model.bin"), dtype, &device)
             }
-            .map_err(|err| BackendError::Start(err.to_string()))?;
+            .s()?;
 
             if incompatible_compute_cap() {
                 return Err(BackendError::Start(format!("Runtime compute cap {} is not compatible with compile time compute cap {}", *RUNTIME_COMPUTE_CAP, *COMPILE_COMPUTE_CAP)));
             }
 
             tracing::info!("Starting FlashBert model on Cuda");
-            Box::new(
-                FlashBertModel::load(vb, &config, pool_config.into())
-                    .map_err(|err| BackendError::Start(err.to_string()))?,
-            )
+            Box::new(FlashBertModel::load(vb, &config, pool).s()?)
         }
     }
 };
@@ -151,8 +136,8 @@ impl EmbeddingBackend for CandleBackend {
     }
 
     fn embed(&self, batch: Batch) -> Result<Vec<Embedding>, BackendError> {
-        let results = self.model.embed(batch).w()?;
-        let results = results.to_dtype(DType::F32).w()?.to_vec2().w()?;
+        let results = self.model.embed(batch).e()?;
+        let results = results.to_dtype(DType::F32).e()?.to_vec2().e()?;
         Ok(results)
     }
 
@@ -165,11 +150,15 @@
 }
 
 pub trait WrapErr<O> {
-    fn w(self) -> Result<O, BackendError>;
+    fn s(self) -> Result<O, BackendError>;
+    fn e(self) -> Result<O, BackendError>;
 }
 
 impl<O> WrapErr<O> for Result<O, candle::Error> {
-    fn w(self) -> Result<O, BackendError> {
+    fn s(self) -> Result<O, BackendError> {
+        self.map_err(|e| BackendError::Start(e.to_string()))
+    }
+    fn e(self) -> Result<O, BackendError> {
        self.map_err(|e| BackendError::Inference(e.to_string()))
    }
}
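With pooling selection hoisted out of the backend, `CandleBackend::new` now takes the resolved `Pool` as a third argument instead of reading `1_Pooling/config.json` itself. A minimal sketch of a call site (the path and pool value are illustrative; the router resolves them from `--model-id` and `--pooling`, falling back to the model's pooling config):

    use std::path::PathBuf;
    use text_embeddings_backend_core::Pool;

    // Hypothetical caller of the new signature.
    let backend = CandleBackend::new(
        PathBuf::from("/data/thenlper--gte-base"), // local model directory
        "float16".to_string(),                     // dtype
        Pool::Mean,                                // pooling resolved by the router
    )?;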

backends/candle/src/models.rs

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@ extern crate accelerate_src;
 mod bert;
 mod bert_quant;
 
-pub use bert::{BertModel, Config, PoolConfig};
+pub use bert::{BertModel, Config};
 pub use bert_quant::QuantBertModel;
 use candle::{Result, Tensor};
 use text_embeddings_backend_core::Batch;

backends/candle/src/models/bert.rs

Lines changed: 19 additions & 43 deletions
@@ -4,7 +4,7 @@ use candle_nn::ops::softmax;
 use candle_nn::{Embedding, VarBuilder};
 use serde::Deserialize;
 use std::collections::HashMap;
-use text_embeddings_backend_core::Batch;
+use text_embeddings_backend_core::{Batch, Pool};
 
 // https://github.com/huggingface/transformers/blob/6eedfa6dd15dc1e22a55ae036f681914e5a0d9a1/src/transformers/models/bert/configuration_bert.py#L1
 #[derive(Debug, Clone, PartialEq, Deserialize)]
@@ -37,43 +37,13 @@
     Relu,
 }
 
-#[derive(Debug, PartialEq)]
-pub enum Pool {
-    Cls,
-    Mean,
-    Max,
-    MeanSqrt,
-}
-
-impl From<PoolConfig> for Pool {
-    fn from(value: PoolConfig) -> Self {
-        if value.pooling_mode_cls_token {
-            Pool::Cls
-        } else if value.pooling_mode_mean_tokens {
-            Pool::Mean
-        } else if value.pooling_mode_max_tokens {
-            Pool::Max
-        } else {
-            Pool::MeanSqrt
-        }
-    }
-}
-
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Deserialize, Default)]
 #[serde(rename_all = "lowercase")]
 pub enum PositionEmbeddingType {
     #[default]
     Absolute,
 }
 
-#[derive(Debug, Clone, PartialEq, Deserialize)]
-pub struct PoolConfig {
-    pooling_mode_cls_token: bool,
-    pooling_mode_mean_tokens: bool,
-    pooling_mode_max_tokens: bool,
-    pooling_mode_mean_sqrt_len_tokens: bool,
-}
-
 #[derive(Debug)]
 struct LayerNorm {
     weight: Tensor,
@@ -85,8 +55,12 @@ struct LayerNorm {
 impl LayerNorm {
     pub fn load(vb: VarBuilder, config: &Config) -> Result<Self> {
         Ok(Self {
-            weight: vb.get(config.hidden_size, "weight")?,
-            bias: vb.get(config.hidden_size, "bias")?,
+            weight: vb
+                .get(config.hidden_size, "weight")
+                .or_else(|_| vb.get(config.hidden_size, "gamma"))?,
+            bias: vb
+                .get(config.hidden_size, "bias")
+                .or_else(|_| vb.get(config.hidden_size, "beta"))?,
             epsilon: config.layer_norm_eps,
             span: tracing::span!(tracing::Level::TRACE, "layer-norm"),
         })
@@ -435,15 +409,18 @@
         ) {
             (Ok(embeddings), Ok(encoder)) => (embeddings, encoder),
             (Err(err), _) | (_, Err(err)) => {
-                if let Some(model_type) = &config.model_type {
-                    if let (Ok(embeddings), Ok(encoder)) = (
-                        BertEmbeddings::load(vb.pp(format!("{model_type}.embeddings")), config),
-                        BertEncoder::load(vb.pp(format!("{model_type}.encoder")), config),
-                    ) {
-                        (embeddings, encoder)
-                    } else {
-                        return Err(err);
-                    }
+                let model_type = config.model_type.clone().unwrap_or("bert".to_string());
+
+                if let (Ok(embeddings), Ok(encoder)) = (
+                    BertEmbeddings::load(vb.pp(format!("{model_type}.embeddings")), config),
+                    BertEncoder::load(vb.pp(format!("{model_type}.encoder")), config),
+                ) {
+                    (embeddings, encoder)
+                } else if let (Ok(embeddings), Ok(encoder)) = (
+                    BertEmbeddings::load(vb.pp("bert.embeddings"), config),
+                    BertEncoder::load(vb.pp("bert.encoder"), config),
+                ) {
+                    (embeddings, encoder)
                 } else {
                     return Err(err);
                 }
@@ -484,7 +461,6 @@
             Pool::Cls => outputs.i(0..1)?,
             // Mean pooling
             Pool::Mean => (outputs.sum_keepdim(0)? / (batch.max_length as f64))?,
-            _ => candle::bail!("Pool type {:?} is not supported", self.pool),
         };
 
         // Normalize
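With `Max` and `MeanSqrt` gone, the pooling match above is exhaustive over `Cls` and `Mean`: CLS keeps the first token's hidden state, mean averages over the sequence. A self-contained candle sketch of the same two reductions on dummy data (not the crate's code, just the arithmetic it performs):

    use candle::{Device, IndexOp, Tensor};

    fn main() -> candle::Result<()> {
        // Dummy [seq_len = 4, hidden = 3] encoder output.
        let outputs = Tensor::arange(0f32, 12f32, &Device::Cpu)?.reshape((4, 3))?;

        // CLS pooling: slice out the first token, shape [1, 3].
        let cls = outputs.i(0..1)?;

        // Mean pooling: sum over the sequence dim, divide by its length.
        let mean = (outputs.sum_keepdim(0)? / 4f64)?;

        println!("cls: {cls}\nmean: {mean}");
        Ok(())
    }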
