Commit 4048136

feat: support roberta (#62)
1 parent 944f5c7 commit 4048136

File tree

3 files changed (+14, -12)

README.md

Lines changed: 5 additions & 6 deletions
````diff
@@ -53,8 +53,7 @@ such as:
 
 ### Supported Models
 
-You can use any JinaBERT model with Alibi or absolute positions or any BERT, CamemBERT or XLM-RoBERTa model with
-absolute positions in `text-embeddings-inference`.
+You can use any JinaBERT model with Alibi or absolute positions or any BERT, CamemBERT, RoBERTa, or XLM-RoBERTa model with absolute positions in `text-embeddings-inference`.
 
 **Support for other model types will be added in the future.**
 
@@ -96,8 +95,8 @@ curl 127.0.0.1:8080/embed \
     -H 'Content-Type: application/json'
 ```
 
-**Note:** To use GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html).
-We also recommend using NVIDIA drivers with CUDA version 12.0 or higher.
+**Note:** To use GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html).
+We also recommend using NVIDIA drivers with CUDA version 12.0 or higher.
 
 To see all options to serve your models:
 
@@ -236,7 +235,7 @@ Text Embeddings Inference ships with multiple Docker images that you can use to
 | Ada Lovelace (RTX 4000 series, ...) | ghcr.io/huggingface/text-embeddings-inference:89-0.3.0 |
 | Hopper (H100) | ghcr.io/huggingface/text-embeddings-inference:hopper-0.3.0 (experimental) |
 
-**Warning**: Flash Attention is turned off by default for the Turing image as it suffers from precision issues.
+**Warning**: Flash Attention is turned off by default for the Turing image as it suffers from precision issues.
 You can turn Flash Attention v1 ON by using the `USE_FLASH_ATTENTION=True` environment variable.
 
 ### API documentation
@@ -329,7 +328,7 @@ cargo install --path router -F candle-cuda-turing --no-default-features
 cargo install --path router -F candle-cuda --no-default-features
 ```
 
-You can now launch Text Embeddings Inference on GPU with:
+You can now launch Text Embeddings Inference on GPU with:
 
 ```shell
 model=BAAI/bge-large-en-v1.5
````

backends/candle/src/lib.rs

Lines changed: 1 addition & 0 deletions
```diff
@@ -41,6 +41,7 @@ impl CandleBackend {
         if config.model_type != Some("bert".to_string())
             && config.model_type != Some("xlm-roberta".to_string())
             && config.model_type != Some("camembert".to_string())
+            && config.model_type != Some("roberta".to_string())
         {
             return Err(BackendError::Start(format!(
                 "Model {:?} is not supported",
```

router/src/main.rs

Lines changed: 8 additions & 6 deletions
```diff
@@ -215,12 +215,14 @@ async fn main() -> Result<()> {
     tokenizer.with_padding(None);
 
     // Position IDs offset. Used for Roberta and camembert.
-    let position_offset =
-        if &config.model_type == "xlm-roberta" || &config.model_type == "camembert" {
-            config.pad_token_id + 1
-        } else {
-            0
-        };
+    let position_offset = if &config.model_type == "xlm-roberta"
+        || &config.model_type == "camembert"
+        || &config.model_type == "roberta"
+    {
+        config.pad_token_id + 1
+    } else {
+        0
+    };
     let max_input_length = config.max_position_embeddings - position_offset;
 
     let tokenization_workers = args
```
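The offset exists because RoBERTa-family checkpoints reserve the lowest position IDs for padding: in the usual Hugging Face configuration `pad_token_id` is 1, so real token positions start at `pad_token_id + 1 = 2`, which is also why such models ship with `max_position_embeddings = 514` yet accept only 512 input tokens. A self-contained sketch of that arithmetic, assuming typical RoBERTa values; the `Config` struct and `position_offset` function here are illustrative, not the router's actual types:

```rust
// Hypothetical sketch of the position-offset arithmetic; names are illustrative.
struct Config {
    model_type: String,
    pad_token_id: usize,
    max_position_embeddings: usize,
}

/// RoBERTa-family models reserve positions 0..=pad_token_id, so real token
/// positions start at pad_token_id + 1; BERT-style models start at 0.
fn position_offset(config: &Config) -> usize {
    match config.model_type.as_str() {
        "xlm-roberta" | "camembert" | "roberta" => config.pad_token_id + 1,
        _ => 0,
    }
}

fn main() {
    // Typical RoBERTa values: pad_token_id = 1, max_position_embeddings = 514.
    let config = Config {
        model_type: "roberta".to_string(),
        pad_token_id: 1,
        max_position_embeddings: 514,
    };
    let offset = position_offset(&config);
    let max_input_length = config.max_position_embeddings - offset;
    assert_eq!(max_input_length, 512); // effective input budget for the model
}
```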
