Commit c0209ff

v0.5.0
1 parent 8e85e9c commit c0209ff

File tree

10 files changed: +58 additions, -88 deletions

Cargo.lock

Lines changed: 29 additions & 57 deletions
Some generated files are not rendered by default.

Cargo.toml

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ members = [
 resolver = "2"
 
 [workspace.package]
-version = "0.4.0"
+version = "0.5.0"
 edition = "2021"
 authors = ["Olivier Dehaene"]
 homepage = "https://github.com/huggingface/text-embeddings-inference"

README.md

Lines changed: 13 additions & 13 deletions
@@ -101,7 +101,7 @@ model=BAAI/bge-large-en-v1.5
 revision=refs/pr/5
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 
-docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.4.0 --model-id $model --revision $revision
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.5 --model-id $model --revision $revision
 ```
 
 And then you can make requests like
@@ -242,15 +242,15 @@ Options:
 
 Text Embeddings Inference ships with multiple Docker images that you can use to target a specific backend:
 
-| Architecture                        | Image                                                                       |
-|-------------------------------------|-----------------------------------------------------------------------------|
-| CPU                                 | ghcr.io/huggingface/text-embeddings-inference:cpu-0.4.0                     |
-| Volta                               | NOT SUPPORTED                                                                |
-| Turing (T4, RTX 2000 series, ...)   | ghcr.io/huggingface/text-embeddings-inference:turing-0.4.0 (experimental)   |
-| Ampere 80 (A100, A30)               | ghcr.io/huggingface/text-embeddings-inference:0.4.0                         |
-| Ampere 86 (A10, A40, ...)           | ghcr.io/huggingface/text-embeddings-inference:86-0.4.0                      |
-| Ada Lovelace (RTX 4000 series, ...) | ghcr.io/huggingface/text-embeddings-inference:89-0.4.0                      |
-| Hopper (H100)                       | ghcr.io/huggingface/text-embeddings-inference:hopper-0.4.0 (experimental)   |
+| Architecture                        | Image                                                                     |
+|-------------------------------------|---------------------------------------------------------------------------|
+| CPU                                 | ghcr.io/huggingface/text-embeddings-inference:cpu-0.5                     |
+| Volta                               | NOT SUPPORTED                                                              |
+| Turing (T4, RTX 2000 series, ...)   | ghcr.io/huggingface/text-embeddings-inference:turing-0.5 (experimental)   |
+| Ampere 80 (A100, A30)               | ghcr.io/huggingface/text-embeddings-inference:0.5                         |
+| Ampere 86 (A10, A40, ...)           | ghcr.io/huggingface/text-embeddings-inference:86-0.5                      |
+| Ada Lovelace (RTX 4000 series, ...) | ghcr.io/huggingface/text-embeddings-inference:89-0.5                      |
+| Hopper (H100)                       | ghcr.io/huggingface/text-embeddings-inference:hopper-0.5 (experimental)   |
 
 **Warning**: Flash Attention is turned off by default for the Turing image as it suffers from precision issues.
 You can turn Flash Attention v1 ON by using the `USE_FLASH_ATTENTION=True` environment variable.
@@ -279,7 +279,7 @@ model=<your private model>
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 token=<your cli READ token>
 
-docker run --gpus all -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.4.0 --model-id $model
+docker run --gpus all -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.5 --model-id $model
 ```
 
 ### Using Re-rankers models
@@ -297,7 +297,7 @@ model=BAAI/bge-reranker-large
 revision=refs/pr/4
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 
-docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.4.0 --model-id $model --revision $revision
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.5 --model-id $model --revision $revision
 ```
 
 And then you can rank the similarity between a query and a list of passages with:
@@ -317,7 +317,7 @@ You can also use classic Sequence Classification models like `SamLowe/roberta-ba
 model=SamLowe/roberta-base-go_emotions
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 
-docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.4.0 --model-id $model
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.5 --model-id $model
 ```
 
 Once you have deployed the model you can use the `predict` endpoint to get the emotions most associated with an input:
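The README hunks above only retag the `docker run` commands from `0.4.0` to `0.5`; the "make requests like" step itself is unchanged by this commit. For context, a minimal Rust sketch of such a request against the `/embed` route (the route name appears in the docs/openapi.json diff below; the `inputs` field, the `8080` port mapping, and the response shape are assumptions drawn from the TEI docs, not from this diff):

```rust
// Hedged sketch, not part of this commit. Assumed dependencies:
// reqwest = { version = "0.11", features = ["json"] }, serde_json, tokio (full).
use serde_json::json;

#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    let client = reqwest::Client::new();
    let response = client
        // Port 8080 matches the `-p 8080:80` mapping in the docker run commands above.
        .post("http://127.0.0.1:8080/embed")
        .json(&json!({ "inputs": "What is Deep Learning?" }))
        .send()
        .await?;
    // Assumed response shape: a JSON array of embedding vectors.
    let embeddings: Vec<Vec<f32>> = response.json().await?;
    println!("dimensions: {}", embeddings[0].len());
    Ok(())
}
```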

backends/Cargo.toml

Lines changed: 0 additions & 1 deletion
@@ -6,7 +6,6 @@ authors.workspace = true
 homepage.workspace = true
 
 [dependencies]
-flume = "^0.11"
 clap = { version = "4.1.4", features = ["derive"], optional = true }
 text-embeddings-backend-core = { path = "core" }
 text-embeddings-backend-python = { path = "python", optional = true }

backends/src/lib.rs

Lines changed: 5 additions & 5 deletions
@@ -5,7 +5,7 @@ use std::sync::atomic::{AtomicBool, Ordering};
 use std::sync::Arc;
 use std::time::{Duration, Instant};
 use text_embeddings_backend_core::Backend as CoreBackend;
-use tokio::sync::oneshot;
+use tokio::sync::{mpsc, oneshot};
 use tracing::{instrument, Span};
 
 pub use crate::dtype::DType;
@@ -20,7 +20,7 @@ use text_embeddings_backend_python::PythonBackend;
 #[derive(Debug, Clone)]
 pub struct Backend {
     /// Channel to communicate with the background thread
-    backend_sender: flume::Sender<BackendCommand>,
+    backend_sender: mpsc::UnboundedSender<BackendCommand>,
     /// Health status
     health: Arc<AtomicBool>,
     pub max_batch_size: Option<usize>,
@@ -35,7 +35,7 @@ impl Backend {
         uds_path: String,
         otlp_endpoint: Option<String>,
     ) -> Result<Self, BackendError> {
-        let (backend_sender, backend_receiver) = flume::unbounded();
+        let (backend_sender, backend_receiver) = mpsc::unbounded_channel();
 
         let backend = init_backend(
             model_path,
@@ -164,9 +164,9 @@ fn init_backend(
 
 fn backend_blocking_task(
     backend: Box<dyn CoreBackend + Send>,
-    command_receiver: flume::Receiver<BackendCommand>,
+    mut command_receiver: mpsc::UnboundedReceiver<BackendCommand>,
 ) {
-    while let Ok(cmd) = command_receiver.recv() {
+    while let Some(cmd) = command_receiver.blocking_recv() {
         let start = Instant::now();
         match cmd {
             BackendCommand::Health(span, sender) => {
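Beyond the version bump, the substantive change in this release is this swap of `flume` for Tokio's unbounded mpsc channel, matching the dependency removal in backends/Cargo.toml above. A minimal, self-contained sketch of the resulting pattern — async callers send commands, a dedicated blocking thread drains the receiver — assuming a simplified `Command` enum in place of the crate's `BackendCommand`:

```rust
// Sketch of the channel pattern in this diff, with a simplified `Command`
// type standing in for the real `BackendCommand` enum and backend logic.
use tokio::sync::{mpsc, oneshot};

enum Command {
    Health(oneshot::Sender<bool>),
}

// Runs on a plain OS thread: `blocking_recv` parks the thread until a command
// arrives and returns `None` once every sender has been dropped.
fn blocking_task(mut receiver: mpsc::UnboundedReceiver<Command>) {
    while let Some(cmd) = receiver.blocking_recv() {
        match cmd {
            Command::Health(sender) => {
                // Reply to the async caller through the oneshot channel.
                let _ = sender.send(true);
            }
        }
    }
}

#[tokio::main]
async fn main() {
    let (tx, rx) = mpsc::unbounded_channel();
    // Spawn the blocking loop off the async runtime's worker threads;
    // `blocking_recv` would panic if called from within the runtime.
    std::thread::spawn(move || blocking_task(rx));

    let (resp_tx, resp_rx) = oneshot::channel();
    // `send` on an unbounded channel never blocks; only the reply is awaited.
    tx.send(Command::Health(resp_tx)).expect("backend thread alive");
    let healthy = resp_rx.await.expect("backend replied");
    println!("healthy: {healthy}");
}
```

Note the `mut` on the receiver: unlike `flume::Receiver::recv`, which takes `&self`, `tokio::sync::mpsc::UnboundedReceiver::blocking_recv` takes `&mut self`, which is why the hunk above adds `mut command_receiver` and changes the loop from `while let Ok(cmd)` to `while let Some(cmd)`.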

docs/openapi.json

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@
     "license": {
       "name": "HFOIL"
     },
-    "version": "0.4.0"
+    "version": "0.5"
   },
   "paths": {
     "/embed": {

docs/source/en/private_models.md

Lines changed: 1 addition & 1 deletion
@@ -37,5 +37,5 @@ model=<your private model>
 volume=$PWD/data
 token=<your cli Hugging Face Hub token>
 
-docker run --gpus all -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.4.0 --model-id $model
+docker run --gpus all -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.5 --model-id $model
 ```
