
Commit 2defbcc

rely on driver, not on nvidia-smi
1 parent 23f5f7f commit 2defbcc

3 files changed: +122 -20 lines changed


.github/workflows/build.yaml

Lines changed: 10 additions & 10 deletions
@@ -169,8 +169,8 @@
             DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}
           tags: ${{ steps.meta-75.outputs.tags }}
           labels: ${{ steps.meta-75.outputs.labels }}
-          cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache,mode=max
-          cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache,mode=max
+          cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-75,mode=min
+          cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-75,mode=min
       - name: Build and push Docker image
         id: build-and-push-80
         uses: docker/build-push-action@v4
@@ -185,8 +185,8 @@
             DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}
           tags: ${{ steps.meta-80.outputs.tags }}
           labels: ${{ steps.meta-80.outputs.labels }}
-          cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache,mode=max
-          cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache,mode=max
+          cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-80,mode=min
+          cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-80,mode=min
       - name: Build and push Docker image
         id: build-and-push-86
         uses: docker/build-push-action@v4
@@ -201,8 +201,8 @@
             DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}
           tags: ${{ steps.meta-86.outputs.tags }}
           labels: ${{ steps.meta-86.outputs.labels }}
-          cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache,mode=max
-          cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache,mode=max
+          cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-86,mode=min
+          cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-86,mode=min
       - name: Build and push Docker image
         id: build-and-push-90
         uses: docker/build-push-action@v4
@@ -217,8 +217,8 @@
             DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}
           tags: ${{ steps.meta-90.outputs.tags }}
           labels: ${{ steps.meta-90.outputs.labels }}
-          cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache,mode=max
-          cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache,mode=max
+          cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-90,mode=min
+          cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-90,mode=min
       - name: Build and push Docker image
         id: build-and-push-cpu
         uses: docker/build-push-action@v4
@@ -232,8 +232,8 @@
             DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}
           tags: ${{ steps.meta-cpu.outputs.tags }}
           labels: ${{ steps.meta-cpu.outputs.labels }}
-          cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache,mode=max
-          cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache,mode=max
+          cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-cpu,mode=min
+          cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-cpu,mode=min
 
   stop-runner:
     name: Stop self-hosted EC2 runner

README.md

Lines changed: 101 additions & 1 deletion
@@ -89,10 +89,110 @@ curl 127.0.0.1:8080/embed \
 We also recommend using NVIDIA drivers with CUDA version 12 or higher.
 
 To see all options to serve your models:
-```
+
+```shell
 text-embeddings-router --help
 ```
 
+```
+Usage: text-embeddings-router [OPTIONS]
+
+Options:
+      --model-id <MODEL_ID>
+          The name of the model to load. Can be a MODEL_ID as listed on <https://hf.co/models> like `thenlper/gte-base`. Or it can be a local directory containing the necessary files as saved by `save_pretrained(...)` methods of transformers
+
+          [env: MODEL_ID=]
+          [default: thenlper/gte-base]
+
+      --revision <REVISION>
+          The actual revision of the model if you are referring to a model on the hub. You can use a specific commit id or a branch like `refs/pr/2`
+
+          [env: REVISION=]
+
+      --tokenization-workers <TOKENIZATION_WORKERS>
+          The number of tokenizer workers used for payload validation and truncation inside the router
+
+          [env: TOKENIZATION_WORKERS=]
+          [default: 8]
+
+      --dtype <DTYPE>
+          The dtype to be forced upon the model
+
+          [env: DTYPE=]
+          [default: float16]
+          [possible values: float16, float32]
+
+      --max-concurrent-requests <MAX_CONCURRENT_REQUESTS>
+          The maximum amount of concurrent requests for this particular deployment. Having a low limit will refuse clients requests instead of having them wait for too long and is usually good to handle backpressure correctly
+
+          [env: MAX_CONCURRENT_REQUESTS=]
+          [default: 512]
+
+      --max-batch-tokens <MAX_BATCH_TOKENS>
+          **IMPORTANT** This is one critical control to allow maximum usage of the available hardware.
+
+          This represents the total amount of potential tokens within a batch. When using padding (not recommended) this would be equivalent of `batch_size` * `max_total_tokens`.
+
+          However in the non-padded (flash attention) version this can be much finer.
+
+          For `max_batch_total_tokens=1000`, you could fit `10` queries of `total_tokens=100` or a single query of `1000` tokens.
+
+          Overall this number should be the largest possible until the model is compute bound. Since the actual memory overhead depends on other parameters like if you are flash attention or the model implementation, text-embeddings cannot infer this number automatically.
+
+          [env: MAX_BATCH_TOKENS=]
+          [default: 8192]
+
+      --max-batch-requests <MAX_BATCH_REQUESTS>
+          Optionally control the maximum number of individual requests in a batch
+
+          [env: MAX_BATCH_REQUESTS=]
+
+      --max-client-batch-size <MAX_CLIENT_BATCH_SIZE>
+          Control the maximum number of inputs that a client can send
+
+          [env: MAX_CLIENT_BATCH_SIZE=]
+          [default: 32]
+
+      --hf-api-token <HF_API_TOKEN>
+          Your HuggingFace hub token
+
+          [env: HF_API_TOKEN=]
+
+      --hostname <HOSTNAME>
+          The IP address to listen on
+
+          [env: HOSTNAME=]
+          [default: 0.0.0.0]
+
+  -p, --port <PORT>
+          The port to listen on
+
+          [env: PORT=]
+          [default: 3000]
+
+      --uds-path <UDS_PATH>
+          The name of the unix socket some text-embeddings backends will use as they communicate internally with gRPC
+
+          [env: UDS_PATH=]
+          [default: /tmp/text-embeddings-server]
+
+      --huggingface-hub-cache <HUGGINGFACE_HUB_CACHE>
+          The location of the huggingface hub cache. Used to override the location if you want to provide a mounted disk for instance
+
+          [env: HUGGINGFACE_HUB_CACHE=/data]
+
+      --json-output
+          Outputs the logs in JSON format (useful for telemetry)
+
+          [env: JSON_OUTPUT=]
+
+      --otlp-endpoint <OTLP_ENDPOINT>
+          [env: OTLP_ENDPOINT=]
+
+      --cors-allow-origin <CORS_ALLOW_ORIGIN>
+          [env: CORS_ALLOW_ORIGIN=]
+```
+
 ### Docker Images
 
 Text Embeddings Inference ships with multiple Docker images that you can use to target a specific backend:
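
The `--max-batch-tokens` option documented above boils down to a token-budget calculation: requests are packed into a batch until their summed token counts would exceed the limit. Below is a minimal sketch of that arithmetic only; the `Request` struct and `fill_batch` helper are hypothetical and not part of the router's actual scheduler.

```rust
use std::collections::VecDeque;

// Illustration only: greedy packing under a token budget, mirroring the
// `--max-batch-tokens` rule described in the help text above. This is a
// hypothetical sketch, not the router's real batching code.
struct Request {
    token_count: usize,
}

/// Pop requests from the front of the queue while they still fit in the budget.
fn fill_batch(queue: &mut VecDeque<Request>, max_batch_tokens: usize) -> Vec<Request> {
    let mut batch = Vec::new();
    let mut budget = max_batch_tokens;
    while let Some(next_tokens) = queue.front().map(|r| r.token_count) {
        if next_tokens > budget {
            break;
        }
        budget -= next_tokens;
        batch.push(queue.pop_front().unwrap());
    }
    batch
}

fn main() {
    // Ten 100-token requests fit in a 1000-token budget...
    let mut queue: VecDeque<Request> = (0..10).map(|_| Request { token_count: 100 }).collect();
    assert_eq!(fill_batch(&mut queue, 1000).len(), 10);

    // ...and so does a single 1000-token request.
    let mut queue: VecDeque<Request> = std::iter::once(Request { token_count: 1000 }).collect();
    assert_eq!(fill_batch(&mut queue, 1000).len(), 1);
}
```

With a budget of 1000 tokens this reproduces the example from the help text: ten 100-token requests, or one 1000-token request, fill a single batch.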

backends/candle/src/compute_cap.rs

Lines changed: 11 additions & 9 deletions
@@ -1,17 +1,19 @@
+use candle::cuda_backend::cudarc::driver::sys::CUdevice_attribute::{
+    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
+};
+use candle::cuda_backend::cudarc::driver::CudaDevice;
 use lazy_static::lazy_static;
 
 lazy_static! {
     pub static ref RUNTIME_COMPUTE_CAP: usize = {
-        let out = std::process::Command::new("nvidia-smi")
-            .arg("--query-gpu=compute_cap")
-            .arg("--format=csv")
-            .output()
+        let device = CudaDevice::new(0).expect("cuda is not available");
+        let major = device
+            .attribute(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR)
             .unwrap();
-        let out = std::str::from_utf8(&out.stdout).unwrap();
-        let mut lines = out.lines();
-        assert_eq!(lines.next().unwrap(), "compute_cap");
-        let cap = lines.next().unwrap().replace('.', "");
-        cap.parse::<usize>().unwrap()
+        let minor = device
+            .attribute(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR)
+            .unwrap();
+        (major * 10 + minor) as usize
     };
     pub static ref COMPILE_COMPUTE_CAP: usize = env!("CUDA_COMPUTE_CAP").parse::<usize>().unwrap();
 }
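
Both statics now collapse the compute capability into a single integer (`major * 10 + minor`, e.g. `75` for a T4 or `80` for an A100), so comparing the runtime GPU against the capability the binary was compiled for becomes a plain integer comparison. The sketch below is a hypothetical illustration of such a check, not the crate's actual validation logic, and it deliberately ignores the finer rules about which compiled kernels run on which architectures.

```rust
// Hypothetical startup check built on RUNTIME_COMPUTE_CAP / COMPILE_COMPUTE_CAP
// style values; the real crate may enforce stricter compatibility rules.
fn check_compute_cap(runtime_cap: usize, compile_cap: usize) -> Result<(), String> {
    if runtime_cap < compile_cap {
        Err(format!(
            "GPU reports compute capability {}, but this binary was compiled for {}",
            runtime_cap, compile_cap
        ))
    } else {
        Ok(())
    }
}

fn main() {
    // A binary compiled for sm_80 cannot run on an sm_75 GPU.
    assert!(check_compute_cap(75, 80).is_err());
    // Matching capabilities pass.
    assert!(check_compute_cap(80, 80).is_ok());
}
```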
