Commit 8e85e9c

feat: rerank route (#84)
1 parent 36161c1 commit 8e85e9c

10 files changed: 467 additions and 33 deletions

README.md

Lines changed: 11 additions & 5 deletions

@@ -33,6 +33,7 @@ length of 512 tokens:
 - [Docker Images](#docker-images)
 - [API Documentation](#api-documentation)
 - [Using a private or gated model](#using-a-private-or-gated-model)
+- [Using Re-rankers models](#using-re-rankers-models)
 - [Using Sequence Classification models](#using-sequence-classification-models)
 - [Distributed Tracing](#distributed-tracing)
 - [Local Install](#local-install)
@@ -281,11 +282,14 @@ token=<your cli READ token>
 docker run --gpus all -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.4.0 --model-id $model
 ```

-### Using Sequence Classification models
+### Using Re-rankers models

 `text-embeddings-inference` v0.4.0 added support for CamemBERT, RoBERTa and XLM-RoBERTa Sequence Classification models.
+Re-rankers models are Sequence Classification cross-encoders models with a single class that scores the similarity
+between a query and a passage.
+
 See [this blogpost](https://blog.llamaindex.ai/boosting-rag-picking-the-best-embedding-reranker-models-42d079022e83) by
-the LlamaIndex team to understand how you can use Sequence Classification models in your RAG pipeline to improve
+the LlamaIndex team to understand how you can use re-rankers models in your RAG pipeline to improve
 downstream performance.

 ```shell
@@ -296,15 +300,17 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading
 docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.4.0 --model-id $model --revision $revision
 ```

-And then you can rank the similarity between a pair of inputs with:
+And then you can rank the similarity between a query and a list of passages with:

 ```bash
-curl 127.0.0.1:8080/predict \
+curl 127.0.0.1:8080/rerank \
     -X POST \
-    -d '{"inputs":["What is Deep Learning?", "Deep learning is..."], "raw_scores": true}' \
+    -d '{"query":"What is Deep Learning?", "passages": ["Deep Learning is not...", "Deep learning is..."]}' \
     -H 'Content-Type: application/json'
 ```

+### Using Sequence Classification models
+
 You can also use classic Sequence Classification models like `SamLowe/roberta-base-go_emotions`:

 ```shell
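
As a hedged illustration (not part of the commit): the `RerankRequest` schema added in `docs/openapi.json` below also documents optional `raw_scores`, `return_passages` and `truncate` flags, and the response is documented as a JSON array of `Rank` objects carrying an `index`, a `score` and an optional `passage`. A sketch of how that could be exercised, with invented scores and no guarantee about the ordering of the returned array:

```bash
# Sketch only: ask the new /rerank route to echo the passages back, using the
# return_passages flag from the RerankRequest schema added by this commit.
curl 127.0.0.1:8080/rerank \
    -X POST \
    -d '{"query":"What is Deep Learning?", "passages": ["Deep Learning is not...", "Deep learning is..."], "return_passages": true}' \
    -H 'Content-Type: application/json'
# The documented response shape is an array of Rank objects, for example (values invented):
# [{"index":0,"score":0.02,"passage":"Deep Learning is not..."},
#  {"index":1,"score":0.99,"passage":"Deep learning is..."}]
```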

backends/core/src/lib.rs

Lines changed: 1 addition & 1 deletion

@@ -53,7 +53,7 @@ pub enum BackendError {
     NoBackend,
     #[error("Could not start backend: {0}")]
     Start(String),
-    #[error("Inference error: {0}")]
+    #[error("{0}")]
     Inference(String),
     #[error("Backend is unhealthy")]
     Unhealthy,

backends/src/lib.rs

Lines changed: 9 additions & 6 deletions

@@ -3,6 +3,7 @@ mod dtype;
 use std::path::PathBuf;
 use std::sync::atomic::{AtomicBool, Ordering};
 use std::sync::Arc;
+use std::time::{Duration, Instant};
 use text_embeddings_backend_core::Backend as CoreBackend;
 use tokio::sync::oneshot;
 use tracing::{instrument, Span};
@@ -91,7 +92,7 @@ impl Backend {
     }

     #[instrument(skip_all)]
-    pub async fn embed(&self, batch: Batch) -> Result<Vec<Embedding>, BackendError> {
+    pub async fn embed(&self, batch: Batch) -> Result<(Vec<Embedding>, Duration), BackendError> {
         let (sender, receiver) = oneshot::channel();

         self.backend_sender
@@ -107,7 +108,7 @@ impl Backend {
     }

     #[instrument(skip_all)]
-    pub async fn predict(&self, batch: Batch) -> Result<Vec<Vec<f32>>, BackendError> {
+    pub async fn predict(&self, batch: Batch) -> Result<(Vec<Vec<f32>>, Duration), BackendError> {
         let (sender, receiver) = oneshot::channel();

         self.backend_sender
@@ -166,18 +167,19 @@ fn backend_blocking_task(
     command_receiver: flume::Receiver<BackendCommand>,
 ) {
     while let Ok(cmd) = command_receiver.recv() {
+        let start = Instant::now();
         match cmd {
             BackendCommand::Health(span, sender) => {
                 let _span = span.entered();
                 let _ = sender.send(backend.health());
             }
             BackendCommand::Embed(batch, span, sender) => {
                 let _span = span.entered();
-                let _ = sender.send(backend.embed(batch));
+                let _ = sender.send(backend.embed(batch).map(|e| (e, start.elapsed())));
             }
             BackendCommand::Predict(batch, span, sender) => {
                 let _span = span.entered();
-                let _ = sender.send(backend.predict(batch));
+                let _ = sender.send(backend.predict(batch).map(|e| (e, start.elapsed())));
             }
         }
     }
@@ -188,11 +190,12 @@ enum BackendCommand {
     Embed(
         Batch,
         Span,
-        oneshot::Sender<Result<Vec<Embedding>, BackendError>>,
+        oneshot::Sender<Result<(Vec<Embedding>, Duration), BackendError>>,
     ),
     Predict(
         Batch,
         Span,
-        oneshot::Sender<Result<Vec<Vec<f32>>, BackendError>>,
+        #[allow(clippy::type_complexity)]
+        oneshot::Sender<Result<(Vec<Vec<f32>>, Duration), BackendError>>,
     ),
 }

core/src/infer.rs

Lines changed: 5 additions & 7 deletions

@@ -90,7 +90,7 @@
     ) -> Result<InferResponse, TextEmbeddingsError> {
         if self.is_classifier() {
             metrics::increment_counter!("te_request_failure", "err" => "model_type");
-            let message = "model is not an embedding model".to_string();
+            let message = "Model is not an embedding model".to_string();
             tracing::error!("{message}");
             return Err(TextEmbeddingsError::Backend(BackendError::Inference(
                 message,
@@ -185,8 +185,7 @@
     ) -> Result<InferResponse, TextEmbeddingsError> {
         if !self.is_classifier() {
             metrics::increment_counter!("te_request_failure", "err" => "model_type");
-            let message = "model is not a classifier model".to_string();
-            // tracing::error!("{message}");
+            let message = "Model is not a classifier model".to_string();
             return Err(TextEmbeddingsError::Backend(BackendError::Inference(
                 message,
             )));
@@ -313,22 +312,21 @@ async fn backend_task(
     mut embed_receiver: mpsc::UnboundedReceiver<(NextBatch, oneshot::Sender<()>)>,
 ) {
     while let Some((batch, _callback)) = embed_receiver.recv().await {
-        let inference_start = Instant::now();
         let results = match &backend.model_type {
             ModelType::Classifier => backend.predict(batch.1).await,
             ModelType::Embedding(_) => backend.embed(batch.1).await,
         };

         // Handle sending responses in another thread to avoid starving the backend
         tokio::task::spawn_blocking(move || match results {
-            Ok(embeddings) => {
+            Ok((embeddings, inference_duration)) => {
                 batch.0.into_iter().zip(embeddings).for_each(|(m, e)| {
                     let _ = m.response_tx.send(Ok(InferResponse {
                         results: e,
                         prompt_tokens: m.prompt_tokens,
                         tokenization: m.tokenization,
-                        queue: inference_start - m.queue_time,
-                        inference: inference_start.elapsed(),
+                        queue: m.queue_time.elapsed() - inference_duration,
+                        inference: inference_duration,
                     }));
                 });
             }

core/src/tokenization.rs

Lines changed: 6 additions & 0 deletions

@@ -180,6 +180,12 @@ impl From<String> for EncodingInput {
     }
 }

+impl From<(String, String)> for EncodingInput {
+    fn from(value: (String, String)) -> Self {
+        Self::Dual(value.0, value.1)
+    }
+}
+
 type TokenizerRequest = (
     EncodingInput,
     bool,

docs/openapi.json

Lines changed: 156 additions & 0 deletions

@@ -348,6 +348,94 @@
           }
         }
       }
+    },
+    "/rerank": {
+      "post": {
+        "tags": [
+          "Text Embeddings Inference"
+        ],
+        "summary": "Get Ranks. Returns a 424 status code if the model is not a Sequence Classification model with",
+        "description": "Get Ranks. Returns a 424 status code if the model is not a Sequence Classification model with\na single class.",
+        "operationId": "rerank",
+        "requestBody": {
+          "content": {
+            "application/json": {
+              "schema": {
+                "$ref": "#/components/schemas/RerankRequest"
+              }
+            }
+          },
+          "required": true
+        },
+        "responses": {
+          "200": {
+            "description": "Ranks",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/RerankResponse"
+                }
+              }
+            }
+          },
+          "413": {
+            "description": "Batch size error",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                },
+                "example": {
+                  "error": "Batch size error",
+                  "error_type": "validation"
+                }
+              }
+            }
+          },
+          "422": {
+            "description": "Tokenization error",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                },
+                "example": {
+                  "error": "Tokenization error",
+                  "error_type": "tokenizer"
+                }
+              }
+            }
+          },
+          "424": {
+            "description": "Rerank Error",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                },
+                "example": {
+                  "error": "Inference failed",
+                  "error_type": "backend"
+                }
+              }
+            }
+          },
+          "429": {
+            "description": "Model is overloaded",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                },
+                "example": {
+                  "error": "Model is overloaded",
+                  "error_type": "overloaded"
+                }
+              }
+            }
+          }
+        }
+      }
     }
   },
   "components": {
@@ -797,6 +885,74 @@
             "example": "0.5"
           }
         }
+      },
+      "Rank": {
+        "type": "object",
+        "required": [
+          "index",
+          "score"
+        ],
+        "properties": {
+          "index": {
+            "type": "integer",
+            "example": "0",
+            "minimum": 0
+          },
+          "passage": {
+            "type": "string",
+            "default": "null",
+            "example": "Deep Learning is ...",
+            "nullable": true
+          },
+          "score": {
+            "type": "number",
+            "format": "float",
+            "example": "1.0"
+          }
+        }
+      },
+      "RerankRequest": {
+        "type": "object",
+        "required": [
+          "query",
+          "passages"
+        ],
+        "properties": {
+          "passages": {
+            "type": "array",
+            "items": {
+              "type": "string"
+            },
+            "example": [
+              "Deep Learning is ..."
+            ]
+          },
+          "query": {
+            "type": "string",
+            "example": "What is Deep Learning?"
+          },
+          "raw_scores": {
+            "type": "boolean",
+            "default": "false",
+            "example": "false"
+          },
+          "return_passages": {
+            "type": "boolean",
+            "default": "false",
+            "example": "false"
+          },
+          "truncate": {
+            "type": "boolean",
+            "default": "false",
+            "example": "false"
+          }
+        }
+      },
+      "RerankResponse": {
+        "type": "array",
+        "items": {
+          "$ref": "#/components/schemas/Rank"
+        }
       }
     }
   },
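
The new `/rerank` path above documents a 424 response when the served model is not a single-class Sequence Classification model. A hedged sketch of probing that failure mode from the command line; the status code and body below are taken from the OpenAPI example and from the message string set in `core/src/infer.rs`, not from an actual run:

```bash
# Sketch only: calling /rerank while an embedding model is deployed.
# -i prints the response status line together with the body.
curl -i 127.0.0.1:8080/rerank \
    -X POST \
    -d '{"query":"What is Deep Learning?", "passages": ["Deep learning is..."]}' \
    -H 'Content-Type: application/json'
# Documented failure shape: HTTP 424 with an ErrorResponse body such as
# {"error": "Inference failed", "error_type": "backend"}
# (core/src/infer.rs sets the more specific message "Model is not a classifier model").
```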

docs/source/en/quick_tour.md

Lines changed: 11 additions & 6 deletions

@@ -53,11 +53,13 @@ curl 127.0.0.1:8080/embed \
     -H 'Content-Type: application/json'
 ```

-## Sequence Classification
+## Re-rankers
+
+Re-rankers models are Sequence Classification cross-encoders models with a single class that scores the similarity
+between a query and a passage.

-TEI can also be used to deploy Sequence Classification models.
 See [this blogpost](https://blog.llamaindex.ai/boosting-rag-picking-the-best-embedding-reranker-models-42d079022e83) by
-the LlamaIndex team to understand how you can use Sequence Classification models in your RAG pipeline to improve
+the LlamaIndex team to understand how you can use re-rankers models in your RAG pipeline to improve
 downstream performance.

 Let's say you want to use `BAAI/bge-reranker-large`:
@@ -70,15 +72,18 @@ volume=$PWD/data
 docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.4.0 --model-id $model --revision $revision
 ```

-Once you have deployed a model you can use the `predict` endpoint and rank the similarity between a pair of inputs:
+Once you have deployed a model you can use the `rerank` endpoint to rank the similarity between a query and a list
+of passages:

 ```bash
-curl 127.0.0.1:8080/predict \
+curl 127.0.0.1:8080/rerank \
     -X POST \
-    -d '{"inputs":["What is Deep Learning?", "Deep learning is..."], "raw_scores": true}' \
+    -d '{"query":"What is Deep Learning?", "passages": ["Deep Learning is not...", "Deep learning is..."], "raw_scores": false}' \
     -H 'Content-Type: application/json'
 ```

+## Sequence Classification
+
 You can also use classic Sequence Classification models like `SamLowe/roberta-base-go_emotions`:

 ```shell
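
A small follow-on sketch, not from the commit and assuming `jq` is installed: because `/rerank` answers with an array of `Rank` objects, the best passage can be selected by its `score` field without relying on any particular ordering of the array:

```bash
# Print the index of the highest-scoring passage from the /rerank response.
# max_by(.score) picks the Rank object with the largest score; .index extracts its position.
curl -s 127.0.0.1:8080/rerank \
    -X POST \
    -d '{"query":"What is Deep Learning?", "passages": ["Deep Learning is not...", "Deep learning is..."]}' \
    -H 'Content-Type: application/json' \
    | jq 'max_by(.score).index'
```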

docs/source/en/supported_models.md

Lines changed: 1 addition & 1 deletion

@@ -44,7 +44,7 @@ Below are some examples of the currently supported models:
 To explore the list of best performing text embeddings models, visit the
 [Massive Text Embedding Benchmark (MTEB) Leaderboard](https://huggingface.co/spaces/mteb/leaderboard).

-## Supported sequence classification models
+## Supported re-rankers and sequence classification models

 Text Embeddings Inference currently supports CamemBERT, and XLM-RoBERTa Sequence Classification models with absolute positions.
