Commit b41601c: v0.4.0
Parent: 4e1f5cd
File tree: 10 files changed (+123, -89 lines)


Cargo.lock

Lines changed: 71 additions & 59 deletions
Generated file; diff not rendered by default.

Cargo.toml

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ members = [
 resolver = "2"
 
 [workspace.package]
-version = "0.3.0"
+version = "0.4.0"
 edition = "2021"
 authors = ["Olivier Dehaene"]
 homepage = "https://github.com/huggingface/text-embeddings-inference"

README.md

Lines changed: 11 additions & 11 deletions
@@ -100,7 +100,7 @@ model=BAAI/bge-large-en-v1.5
 revision=refs/pr/5
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 
-docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.3.0 --model-id $model --revision $revision
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.4.0 --model-id $model --revision $revision
 ```
 
 And then you can make requests like
@@ -243,13 +243,13 @@ Text Embeddings Inference ships with multiple Docker images that you can use to
 
 | Architecture | Image |
 |-------------------------------------|---------------------------------------------------------------------------|
-| CPU | ghcr.io/huggingface/text-embeddings-inference:cpu-0.3.0 |
+| CPU | ghcr.io/huggingface/text-embeddings-inference:cpu-0.4.0 |
 | Volta | NOT SUPPORTED |
-| Turing (T4, RTX 2000 series, ...) | ghcr.io/huggingface/text-embeddings-inference:turing-0.3.0 (experimental) |
-| Ampere 80 (A100, A30) | ghcr.io/huggingface/text-embeddings-inference:0.3.0 |
-| Ampere 86 (A10, A40, ...) | ghcr.io/huggingface/text-embeddings-inference:86-0.3.0 |
-| Ada Lovelace (RTX 4000 series, ...) | ghcr.io/huggingface/text-embeddings-inference:89-0.3.0 |
-| Hopper (H100) | ghcr.io/huggingface/text-embeddings-inference:hopper-0.3.0 (experimental) |
+| Turing (T4, RTX 2000 series, ...) | ghcr.io/huggingface/text-embeddings-inference:turing-0.4.0 (experimental) |
+| Ampere 80 (A100, A30) | ghcr.io/huggingface/text-embeddings-inference:0.4.0 |
+| Ampere 86 (A10, A40, ...) | ghcr.io/huggingface/text-embeddings-inference:86-0.4.0 |
+| Ada Lovelace (RTX 4000 series, ...) | ghcr.io/huggingface/text-embeddings-inference:89-0.4.0 |
+| Hopper (H100) | ghcr.io/huggingface/text-embeddings-inference:hopper-0.4.0 (experimental) |
 
 **Warning**: Flash Attention is turned off by default for the Turing image as it suffers from precision issues.
 You can turn Flash Attention v1 ON by using the `USE_FLASH_ATTENTION=True` environment variable.
@@ -278,7 +278,7 @@ model=<your private model>
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 token=<your cli READ token>
 
-docker run --gpus all -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.3.0 --model-id $model
+docker run --gpus all -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.4.0 --model-id $model
 ```
 
 ### Using Sequence Classification models
@@ -293,7 +293,7 @@ model=BAAI/bge-reranker-large
 revision=refs/pr/4
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 
-docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.3.0 --model-id $model --revision $revision
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.4.0 --model-id $model --revision $revision
 ```
 
 And then you can rank the similarity between a pair of inputs with:
@@ -309,9 +309,9 @@ You can also use classic Sequence Classification models like `SamLowe/roberta-ba
 
 ```shell
 model=SamLowe/roberta-base-go_emotions
-volume=$PWD/data
+volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 
-docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.3.0 --model-id $model
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.4.0 --model-id $model
 ```
 
 Once you have deployed the model you can use the `predict` endpoint to get the emotions most associated with an input:
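The `/embed` request these commands lead into is unchanged by this commit. A minimal sketch of such a call, assuming the server started above is listening on the mapped port 8080 (the input string is illustrative):

```shell
curl 127.0.0.1:8080/embed \
    -X POST \
    -d '{"inputs":"What is Deep Learning?"}' \
    -H 'Content-Type: application/json'
```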

core/Cargo.toml

Lines changed: 1 addition & 1 deletion
@@ -10,6 +10,6 @@ hf-hub = { version = "^0.3.0", features = ["tokio"] }
 metrics = "^0.21"
 text-embeddings-backend = { path = "../backends" }
 thiserror = "^1.0"
-tokenizers = { version = "^0.14.1", default-features=false, features=["onig", "esaxx_fast"] }
+tokenizers = { version = "^0.15.0", default-features=false, features=["onig", "esaxx_fast"] }
 tracing = "^0.1"
 tokio = { version = "^1.25", features = ["rt", "rt-multi-thread", "parking_lot", "sync"] }

docs/openapi.json

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@
     "license": {
       "name": "HFOIL"
     },
-    "version": "0.3.0"
+    "version": "0.4.0"
   },
   "paths": {
     "/embed": {

docs/source/en/private_models.md

Lines changed: 1 addition & 1 deletion
@@ -37,5 +37,5 @@ model=<your private model>
 volume=$PWD/data
 token=<your cli Hugging Face Hub token>
 
-docker run --gpus all -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.2.2 --model-id $model
+docker run --gpus all -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.4.0 --model-id $model
 ```

docs/source/en/quick_tour.md

Lines changed: 3 additions & 3 deletions
@@ -34,7 +34,7 @@ model=BAAI/bge-large-en-v1.5
 revision=refs/pr/5
 volume=$PWD/data
 
-docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.2.2 --model-id $model --revision $revision
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.4.0 --model-id $model --revision $revision
 ```
 
 <Tip>
@@ -67,7 +67,7 @@ model=BAAI/bge-reranker-large
 revision=refs/pr/4
 volume=$PWD/data
 
-docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.3.0 --model-id $model --revision $revision
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.4.0 --model-id $model --revision $revision
 ```
 
 Once you have deployed a model you can use the `predict` endpoint and rank the similarity between a pair of inputs:
@@ -85,7 +85,7 @@ You can also use classic Sequence Classification models like `SamLowe/roberta-ba
 model=SamLowe/roberta-base-go_emotions
 volume=$PWD/data
 
-docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.3.0 --model-id $model
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.4.0 --model-id $model
 ```
 
 Once you have deployed the model you can use the `predict` endpoint to get the emotions most associated with an input:
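The `predict` call referenced here takes a plain JSON input string. A minimal sketch, assuming the go_emotions server above is running on the mapped port 8080 (the input text is illustrative):

```shell
curl 127.0.0.1:8080/predict \
    -X POST \
    -d '{"inputs":"I like you."}' \
    -H 'Content-Type: application/json'
```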

docs/source/en/supported_models.md

Lines changed: 12 additions & 10 deletions
@@ -68,13 +68,15 @@ NVIDIA drivers with CUDA version 12.2 or higher.
 
 Find the appropriate Docker image for your hardware in the following table:
 
-| Architecture | Image |
-|-------------------------------------|------------------------------------------------------------|
-| CPU | ghcr.io/huggingface/text-embeddings-inference:cpu-0.2.2 |
-| Volta | NOT SUPPORTED |
-| Turing (T4, RTX 2000 series, ...) | ghcr.io/huggingface/text-embeddings-inference:turing-0.2.2 |
-| Ampere 80 (A100, A30) | ghcr.io/huggingface/text-embeddings-inference:0.2.2 |
-| Ampere 86 (A10, A40, ...) | ghcr.io/huggingface/text-embeddings-inference:86-0.2.2 |
-| Ada Lovelace (RTX 4000 series, ...) | ghcr.io/huggingface/text-embeddings-inference:89-0.2.2 |
-| Hopper (H100) | ghcr.io/huggingface/text-embeddings-inference:hopper-0.2.2 |
-
+| Architecture | Image |
+|-------------------------------------|---------------------------------------------------------------------------|
+| CPU | ghcr.io/huggingface/text-embeddings-inference:cpu-0.4.0 |
+| Volta | NOT SUPPORTED |
+| Turing (T4, RTX 2000 series, ...) | ghcr.io/huggingface/text-embeddings-inference:turing-0.4.0 (experimental) |
+| Ampere 80 (A100, A30) | ghcr.io/huggingface/text-embeddings-inference:0.4.0 |
+| Ampere 86 (A10, A40, ...) | ghcr.io/huggingface/text-embeddings-inference:86-0.4.0 |
+| Ada Lovelace (RTX 4000 series, ...) | ghcr.io/huggingface/text-embeddings-inference:89-0.4.0 |
+| Hopper (H100) | ghcr.io/huggingface/text-embeddings-inference:hopper-0.4.0 (experimental) |
+
+**Warning**: Flash Attention is turned off by default for the Turing image as it suffers from precision issues.
+You can turn Flash Attention v1 ON by using the `USE_FLASH_ATTENTION=True` environment variable.
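The variable is passed like any other container environment variable with `-e`. A sketch for the Turing image, assuming `$model` and `$volume` are set as in the earlier examples:

```shell
docker run --gpus all -e USE_FLASH_ATTENTION=True -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:turing-0.4.0 --model-id $model
```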

router/Cargo.toml

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@ reqwest = { version = "0.11.14", features = [] }
 serde = "1.0.152"
 serde_json = "1.0.93"
 thiserror = "1.0.38"
-tokenizers = { version = "0.14.1", default-features=false, features=["onig", "esaxx_fast"] }
+tokenizers = { version = "0.15.0", default-features=false, features=["onig", "esaxx_fast"] }
 tokio = { version = "1.25.0", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] }
 tower-http = { version = "0.4.0", features = ["cors"] }
 tracing = "0.1.37"

router/src/main.rs

Lines changed: 21 additions & 1 deletion
@@ -19,7 +19,8 @@ use text_embeddings_core::infer::Infer;
 use text_embeddings_core::queue::Queue;
 use text_embeddings_core::tokenization::Tokenization;
 use text_embeddings_router::{server, ClassifierModel, EmbeddingModel, Info, ModelType};
-use tokenizers::Tokenizer;
+use tokenizers::decoders::metaspace::PrependScheme;
+use tokenizers::{PreTokenizerWrapper, Tokenizer};
 use tower_http::cors::AllowOrigin;
 use tracing_subscriber::layer::SubscriberExt;
 use tracing_subscriber::util::SubscriberInitExt;
@@ -255,6 +256,25 @@ async fn main() -> Result<()> {
     let mut tokenizer = Tokenizer::from_file(tokenizer_path).expect(
         "tokenizer.json not found. text-embeddings-inference only supports fast tokenizers",
     );
+    // See https://github.com/huggingface/tokenizers/pull/1357
+    if let Some(pre_tokenizer) = tokenizer.get_pre_tokenizer() {
+        if let PreTokenizerWrapper::Metaspace(m) = pre_tokenizer {
+            // We are forced to clone since `Tokenizer` does not have a `get_mut` for `pre_tokenizer`
+            let mut m = m.clone();
+            m.set_prepend_scheme(PrependScheme::First);
+            tokenizer.with_pre_tokenizer(PreTokenizerWrapper::Metaspace(m));
+        } else if let PreTokenizerWrapper::Sequence(s) = pre_tokenizer {
+            // We are forced to clone since `Tokenizer` does not have a `get_mut` for `pre_tokenizer`
+            let mut s = s.clone();
+            for pre_tokenizer in s.get_pre_tokenizers_mut() {
+                if let PreTokenizerWrapper::Metaspace(m) = pre_tokenizer {
+                    m.set_prepend_scheme(PrependScheme::First);
+                }
+            }
+            tokenizer.with_pre_tokenizer(PreTokenizerWrapper::Sequence(s));
+        }
+    }
+
     tokenizer.with_padding(None);
 
     // Position IDs offset. Used for Roberta and camembert.
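Context for this hunk: the `tokenizers` bump to 0.15.0 in `core/Cargo.toml` and `router/Cargo.toml` brings in the `prepend_scheme` option on the `Metaspace` pre-tokenizer from the PR linked above. Forcing `PrependScheme::First` prepends the replacement character only to the first split of a sequence rather than to every split, whether `Metaspace` is the sole pre-tokenizer or one member of a `Sequence`.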
