
Commit 41b692d: v1.0.0 (#168)

1 parent: f1e50df

11 files changed (+44 lines, −44 lines)


Cargo.lock

Lines changed: 7 additions & 7 deletions
Generated file; diff not rendered by default.

Cargo.toml

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ members = [
 resolver = "2"
 
 [workspace.package]
-version = "0.6.0"
+version = "1.0.0"
 edition = "2021"
 authors = ["Olivier Dehaene"]
 homepage = "https://github.com/huggingface/text-embeddings-inference"
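Note: because `version` sits under `[workspace.package]`, member crates that declare `version.workspace = true` pick up 1.0.0 automatically. A quick sanity check after the bump, sketched under the assumption of a local checkout of the workspace:

```shell
# List every workspace crate's resolved version; crates inheriting
# version.workspace = true should all report 1.0.0.
cargo metadata --format-version 1 --no-deps \
    | grep -o '"version":"[^"]*"' \
    | sort -u
```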

README.md

Lines changed: 14 additions & 14 deletions
@@ -67,10 +67,10 @@ with absolute positions in `text-embeddings-inference`.
 
 Examples of supported models:
 
-| MTEB Rank | Model Type | Example Model ID |
+| MTEB Rank | Model Type | Model ID |
 |-----------|-------------|--------------------------------------------------------------------------------------------------|
 | 6 | Bert | [WhereIsAI/UAE-Large-V1](https://hf.co/WhereIsAI/UAE-Large-V1) |
-| 1O | XLM-RoBERTa | [intfloat/multilingual-e5-large-instruct](https://hf.co/intfloat/multilingual-e5-large-instruct) |
+| 10 | XLM-RoBERTa | [intfloat/multilingual-e5-large-instruct](https://hf.co/intfloat/multilingual-e5-large-instruct) |
 | N/A | NomicBert | [nomic-ai/nomic-embed-text-v1](https://hf.co/nomic-ai/nomic-embed-text-v1) |
 | N/A | NomicBert | [nomic-ai/nomic-embed-text-v1.5](https://hf.co/nomic-ai/nomic-embed-text-v1.5) |
 | N/A | JinaBERT | [jinaai/jina-embeddings-v2-base-en](https://hf.co/jinaai/jina-embeddings-v2-base-en) |
@@ -80,7 +80,7 @@ models [here](https://huggingface.co/spaces/mteb/leaderboard).
 
 #### Sequence Classification and Re-Ranking
 
-`text-embeddings-inference` v0.4.0 added support for CamemBERT, RoBERTa and XLM-RoBERTa Sequence Classification models.
+`text-embeddings-inference` v0.4.0 added support for Bert, CamemBERT, RoBERTa and XLM-RoBERTa Sequence Classification models.
 
 Example of supported sequence classification models:
 
@@ -97,7 +97,7 @@ model=BAAI/bge-large-en-v1.5
 revision=refs/pr/5
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 
-docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.6 --model-id $model --revision $revision
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.0 --model-id $model --revision $revision
 ```
 
 And then you can make requests like
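Note: the request the README goes on to show is a plain POST to the `/embed` route on the mapped port. A minimal sketch of such a call (payload mirrors the README's documented example):

```shell
# Embed a single input; the server returns a JSON array of embedding vectors.
curl 127.0.0.1:8080/embed \
    -X POST \
    -d '{"inputs":"What is Deep Learning?"}' \
    -H 'Content-Type: application/json'
```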
@@ -242,13 +242,13 @@ Text Embeddings Inference ships with multiple Docker images that you can use to
 
 | Architecture | Image |
 |-------------------------------------|-------------------------------------------------------------------------|
-| CPU | ghcr.io/huggingface/text-embeddings-inference:cpu-0.6 |
+| CPU | ghcr.io/huggingface/text-embeddings-inference:cpu-1.0 |
 | Volta | NOT SUPPORTED |
-| Turing (T4, RTX 2000 series, ...) | ghcr.io/huggingface/text-embeddings-inference:turing-0.6 (experimental) |
-| Ampere 80 (A100, A30) | ghcr.io/huggingface/text-embeddings-inference:0.6 |
-| Ampere 86 (A10, A40, ...) | ghcr.io/huggingface/text-embeddings-inference:86-0.6 |
-| Ada Lovelace (RTX 4000 series, ...) | ghcr.io/huggingface/text-embeddings-inference:89-0.6 |
-| Hopper (H100) | ghcr.io/huggingface/text-embeddings-inference:hopper-0.6 (experimental) |
+| Turing (T4, RTX 2000 series, ...) | ghcr.io/huggingface/text-embeddings-inference:turing-1.0 (experimental) |
+| Ampere 80 (A100, A30) | ghcr.io/huggingface/text-embeddings-inference:1.0 |
+| Ampere 86 (A10, A40, ...) | ghcr.io/huggingface/text-embeddings-inference:86-1.0 |
+| Ada Lovelace (RTX 4000 series, ...) | ghcr.io/huggingface/text-embeddings-inference:89-1.0 |
+| Hopper (H100) | ghcr.io/huggingface/text-embeddings-inference:hopper-1.0 (experimental) |
 
 **Warning**: Flash Attention is turned off by default for the Turing image as it suffers from precision issues.
 You can turn Flash Attention v1 ON by using the `USE_FLASH_ATTENTION=True` environment variable.
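Note: per the warning above, the Turing image ships with Flash Attention disabled, and opting in is a matter of passing the environment variable through Docker. A sketch, reusing the `$model` and `$volume` variables from the earlier examples:

```shell
# Enable Flash Attention v1 on the (experimental) Turing image.
docker run --gpus all -e USE_FLASH_ATTENTION=True -p 8080:80 -v $volume:/data \
    --pull always ghcr.io/huggingface/text-embeddings-inference:turing-1.0 \
    --model-id $model
```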
@@ -277,7 +277,7 @@ model=<your private model>
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 token=<your cli READ token>
 
-docker run --gpus all -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.6 --model-id $model
+docker run --gpus all -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.0 --model-id $model
 ```
 
 ### Using Re-rankers models
@@ -295,7 +295,7 @@ model=BAAI/bge-reranker-large
 revision=refs/pr/4
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 
-docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.6 --model-id $model --revision $revision
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.0 --model-id $model --revision $revision
 ```
 
 And then you can rank the similarity between a query and a list of texts with:
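Note: the ranking call that follows in the README targets the `rerank` route with a query plus candidate texts. A minimal sketch (payload mirrors the README's example):

```shell
# Rank candidate texts against the query; the response lists indices with scores.
curl 127.0.0.1:8080/rerank \
    -X POST \
    -d '{"query": "What is Deep Learning?", "texts": ["Deep Learning is not...", "Deep learning is..."]}' \
    -H 'Content-Type: application/json'
```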
@@ -315,7 +315,7 @@ You can also use classic Sequence Classification models like `SamLowe/roberta-ba
 model=SamLowe/roberta-base-go_emotions
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 
-docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.6 --model-id $model
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.0 --model-id $model
 ```
 
 Once you have deployed the model you can use the `predict` endpoint to get the emotions most associated with an input:
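Note: for classification models the call goes to the `predict` route instead. A minimal sketch (input string mirrors the README's example):

```shell
# Classify one input; the response maps labels (here, emotions) to scores.
curl 127.0.0.1:8080/predict \
    -X POST \
    -d '{"inputs":"I like you."}' \
    -H 'Content-Type: application/json'
```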
@@ -344,7 +344,7 @@ model=BAAI/bge-large-en-v1.5
 revision=refs/pr/5
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 
-docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.6-grpc --model-id $model --revision $revision
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.0-grpc --model-id $model --revision $revision
 ```
 
 ```shell
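Note: the hunk is truncated at the opening of the shell block that demonstrates the gRPC client call. As a hedged sketch only, the service and method names below are assumed and not confirmed by this diff:

```shell
# Hypothetical grpcurl invocation against the -grpc image started above;
# tei.v1.Embed/Embed is an assumed service/method name.
grpcurl -d '{"inputs": "What is Deep Learning"}' -plaintext 0.0.0.0:8080 tei.v1.Embed/Embed
```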

docs/openapi.json

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@
     "license": {
       "name": "HFOIL"
     },
-    "version": "0.6.0"
+    "version": "1.0.0"
   },
   "paths": {
     "/embed": {

docs/source/en/index.md

Lines changed: 3 additions & 3 deletions
@@ -23,12 +23,12 @@ TEI offers multiple features tailored to optimize the deployment process and enh
 
 **Key Features:**
 
-* **Streamlined Deployment:** TEI eliminates the need for a model graph compilation step for a more efficient deployment process.
+* **Streamlined Deployment:** TEI eliminates the need for a model graph compilation step for an easier deployment process.
 * **Efficient Resource Utilization:** Benefit from small Docker images and rapid boot times, allowing for true serverless capabilities.
 * **Dynamic Batching:** TEI incorporates token-based dynamic batching thus optimizing resource utilization during inference.
 * **Optimized Inference:** TEI leverages [Flash Attention](https://github.com/HazyResearch/flash-attention), [Candle](https://github.com/huggingface/candle), and [cuBLASLt](https://docs.nvidia.com/cuda/cublas/#using-the-cublaslt-api) by using optimized transformers code for inference.
-* **Safetensors weight loading:** TEI loads [Safetensors](https://github.com/huggingface/safetensors) weights to enable tensor parallelism.
-* **Production-Ready:** TEI supports distributed tracing through Open Telemetry and Prometheus metrics.
+* **Safetensors weight loading:** TEI loads [Safetensors](https://github.com/huggingface/safetensors) weights for faster boot times.
+* **Production-Ready:** TEI supports distributed tracing through Open Telemetry and exports Prometheus metrics.
 
 **Benchmarks**

docs/source/en/local_cpu.md

Lines changed: 1 addition & 1 deletion
@@ -20,7 +20,7 @@ You can install `text-embeddings-inference` locally to run it on your own machin
 
 ## Step 1: Install Rust
 
-[Install Rust]((https://rustup.rs/) on your machine by run the following in your terminal, then following the instructions:
+[Install Rust](https://rustup.rs/) on your machine by run the following in your terminal, then following the instructions:
 
 ```shell
 curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh

docs/source/en/local_gpu.md

Lines changed: 1 addition & 1 deletion
@@ -31,7 +31,7 @@ export PATH=$PATH:/usr/local/cuda/bin
 
 ## Step 2: Install Rust
 
-[Install Rust]((https://rustup.rs/) on your machine by run the following in your terminal, then following the instructions:
+[Install Rust](https://rustup.rs/) on your machine by run the following in your terminal, then following the instructions:
 
 ```shell
 curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh

docs/source/en/local_metal.md

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@ Here are the step-by-step instructions for installation:
 
 ## Step 1: Install Rust
 
-[Install Rust]((https://rustup.rs/) on your machine by run the following in your terminal, then following the instructions:
+[Install Rust](https://rustup.rs/) on your machine by run the following in your terminal, then following the instructions:
 
 ```shell
 curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh

docs/source/en/private_models.md

Lines changed: 1 addition & 1 deletion
@@ -37,5 +37,5 @@ model=<your private model>
 volume=$PWD/data
 token=<your cli Hugging Face Hub token>
 
-docker run --gpus all -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.6 --model-id $model
+docker run --gpus all -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.0 --model-id $model
 ```

docs/source/en/quick_tour.md

Lines changed: 3 additions & 3 deletions
@@ -34,7 +34,7 @@ model=BAAI/bge-large-en-v1.5
 revision=refs/pr/5
 volume=$PWD/data
 
-docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.6 --model-id $model --revision $revision
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.0 --model-id $model --revision $revision
 ```
 
 <Tip>
@@ -69,7 +69,7 @@ model=BAAI/bge-reranker-large
 revision=refs/pr/4
 volume=$PWD/data
 
-docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.6 --model-id $model --revision $revision
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.0 --model-id $model --revision $revision
 ```
 
 Once you have deployed a model you can use the `rerank` endpoint to rank the similarity between a query and a list
@@ -90,7 +90,7 @@ You can also use classic Sequence Classification models like `SamLowe/roberta-ba
 model=SamLowe/roberta-base-go_emotions
 volume=$PWD/data
 
-docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.6 --model-id $model
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.0 --model-id $model
 ```
 
 Once you have deployed the model you can use the `predict` endpoint to get the emotions most associated with an input:
