Skip to content

Commit 4adfe71

Browse files
feat: support jinaAI variant (#48)
1 parent e07f68a commit 4adfe71

File tree

5 files changed

+806
-92
lines changed

5 files changed

+806
-92
lines changed

backends/candle/src/alibi.rs

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
// coding=utf-8
2+
// Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3+
// Copyright (c) 2023 Jina AI GmbH. All rights reserved.
4+
//
5+
// Licensed under the Apache License, Version 2.0 (the "License");
6+
// you may not use this file except in compliance with the License.
7+
// You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing, software
12+
// distributed under the License is distributed on an "AS IS" BASIS,
13+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
// See the License for the specific language governing permissions and
15+
// limitations under the License.
16+
use candle::{DType, Device, Result, Tensor};
17+
18+
/// Slopes for `n` ALiBi attention heads when `n` is a power of two.
///
/// Returns the geometric sequence `start, start^2, ..., start^n` with
/// `start = 2^(-2^(-(log2(n) - 3)))`, matching the reference ALiBi
/// implementation.
fn get_slopes_power_of_2(n: usize) -> Vec<f64> {
    let exponent = -2_f64.powf(-((n as f64).log2() - 3_f64));
    let start = 2_f64.powf(exponent);

    let mut slopes = Vec::with_capacity(n);
    for i in 0..n {
        slopes.push(start * start.powi(i as i32));
    }
    slopes
}

/// Per-head ALiBi slopes for an arbitrary head count.
///
/// For a power-of-two head count the closed-form sequence is used directly.
/// Otherwise, the slopes of the nearest smaller power of two are extended
/// with every other slope from the next larger power of two, exactly as in
/// the reference implementation.
fn alibi_head_slopes(num_attention_heads: usize) -> Vec<f64> {
    let log_heads = (num_attention_heads as f64).log2();

    if log_heads.fract() == 0.0 {
        // `num_attention_heads` is a power of 2.
        return get_slopes_power_of_2(num_attention_heads);
    }

    let closest_power_of_2 = 2_f64.powi(log_heads.floor() as i32) as usize;
    let mut slopes = get_slopes_power_of_2(closest_power_of_2);

    // Number of extra heads beyond the closest smaller power of two; always
    // fewer than `closest_power_of_2`, so `take` below never runs short.
    let extra_needed = num_attention_heads - closest_power_of_2;
    let extra = get_slopes_power_of_2(2 * closest_power_of_2)
        .into_iter()
        // Keep only even-indexed slopes of the doubled sequence.
        .step_by(2)
        .take(extra_needed);

    slopes.extend(extra);
    slopes
}
48+
49+
pub fn build_alibi_tensor(
50+
num_positions: usize,
51+
num_heads: usize,
52+
device: &Device,
53+
dtype: DType,
54+
) -> Result<Tensor> {
55+
let context_positions = Tensor::arange(0.0, num_positions as f64, device)?.unsqueeze(1)?;
56+
let memory_positions = Tensor::arange(0.0, num_positions as f64, device)?.unsqueeze(0)?;
57+
58+
let relative_positions = memory_positions.broadcast_sub(&context_positions)?.abs()?;
59+
// [num_heads, num_positions, num_positions]
60+
let relative_positions =
61+
relative_positions
62+
.unsqueeze(0)?
63+
.expand((num_heads, num_positions, num_positions))?;
64+
65+
// [num_heads, 1, 1]
66+
let slopes =
67+
(Tensor::from_vec(alibi_head_slopes(num_heads), (num_heads, 1, 1), device)? * -1_f64)?;
68+
69+
// [num_heads, num_positions, num_positions]
70+
let alibi = relative_positions.broadcast_mul(&slopes)?;
71+
72+
alibi
73+
.reshape((1, num_heads, num_positions, num_positions))?
74+
.to_dtype(dtype)
75+
}

backends/candle/src/lib.rs

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
mod alibi;
12
#[cfg(feature = "cuda")]
23
mod compute_cap;
34
#[cfg(feature = "cuda")]
@@ -9,7 +10,9 @@ mod models;
910
use crate::compute_cap::{incompatible_compute_cap, COMPILE_COMPUTE_CAP, RUNTIME_COMPUTE_CAP};
1011
#[cfg(feature = "cuda")]
1112
use crate::models::FlashBertModel;
12-
use crate::models::{BertModel, EmbeddingModel, PositionEmbeddingType, QuantBertModel};
13+
use crate::models::{
14+
BertModel, EmbeddingModel, JinaBertModel, PositionEmbeddingType, QuantBertModel,
15+
};
1316
use candle::{DType, Device};
1417
use candle_nn::VarBuilder;
1518
use models::Config;
@@ -47,8 +50,6 @@ impl CandleBackend {
4750

4851
let model: Box<dyn EmbeddingModel + Send> = match device {
4952
Device::Cpu => {
50-
tracing::info!("Starting Bert model on CPU");
51-
5253
if &dtype == "float32" || &dtype == "float16" {
5354
let dtype = if &dtype == "float32" {
5455
DType::F32
@@ -70,14 +71,21 @@ impl CandleBackend {
7071
}
7172
.s()?;
7273

73-
Box::new(BertModel::load(vb, &config, pool).s()?)
74+
if config.position_embedding_type == PositionEmbeddingType::Alibi {
75+
tracing::info!("Starting JinaBert model on CPU");
76+
Box::new(JinaBertModel::load(vb, &config, pool).s()?)
77+
} else {
78+
tracing::info!("Starting Bert model on CPU");
79+
Box::new(BertModel::load(vb, &config, pool).s()?)
80+
}
7481
} else if &dtype == "q6k" {
7582
let vb = candle_transformers::quantized_var_builder::VarBuilder::from_gguf(
7683
model_path.join("ggml-model-q6k.bin"),
7784
)
7885
.map_err(|err| BackendError::Start(err.to_string()))?;
7986
tracing::info!("vb");
8087

88+
tracing::info!("Starting QuantBert model on CPU");
8189
Box::new(QuantBertModel::load(vb, &config, pool).s()?)
8290
} else {
8391
return Err(BackendError::Start(format!(
@@ -130,6 +138,9 @@ impl CandleBackend {
130138
{
131139
tracing::info!("Starting FlashBert model on Cuda");
132140
Box::new(FlashBertModel::load(vb, &config, pool).s()?)
141+
} else if config.position_embedding_type == PositionEmbeddingType::Alibi {
142+
tracing::info!("Starting JinaBert model on Cuda");
143+
Box::new(JinaBertModel::load(vb, &config, pool).s()?)
133144
} else {
134145
tracing::info!("Starting Bert model on Cuda");
135146
Box::new(BertModel::load(vb, &config, pool).s()?)

backends/candle/src/models.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,13 @@ mod bert_quant;
1010
pub use bert::{BertModel, Config, PositionEmbeddingType};
1111
pub use bert_quant::QuantBertModel;
1212
use candle::{Result, Tensor};
13+
pub use jina::JinaBertModel;
1314
use text_embeddings_backend_core::Batch;
1415

1516
#[cfg(feature = "cuda")]
1617
mod flash_bert;
18+
mod jina;
19+
1720
#[cfg(feature = "cuda")]
1821
pub use flash_bert::FlashBertModel;
1922

0 commit comments

Comments
 (0)