
Commit b718452

Merge pull request #1 from huggingface/feat/ci_caching
ci: add caching
2 parents b26fd46 + 297257b commit b718452

5 files changed: +140, -95 lines changed


.github/workflows/build.yaml

Lines changed: 26 additions & 21 deletions

@@ -78,30 +78,29 @@
           install: true
       - name: Inject slug/short variables
         uses: rlespinasse/github-slug-action@v4.4.1
-      # - name: Tailscale
-      #   uses: tailscale/github-action@7bd8039bf25c23c4ab1b8d6e2cc2da2280601966
-      #   with:
-      #     authkey: ${{ secrets.TAILSCALE_AUTHKEY }}
+      - name: Tailscale
+        uses: tailscale/github-action@7bd8039bf25c23c4ab1b8d6e2cc2da2280601966
+        with:
+          authkey: ${{ secrets.TAILSCALE_AUTHKEY }}
       - name: Login to GitHub Container Registry
-        # if: github.event_name != 'pull_request'
+        if: github.event_name != 'pull_request'
         uses: docker/login-action@v2
         with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
-      # - name: Login to internal Container Registry
-      #   uses: docker/login-action@v2.1.0
-      #   with:
-      #     username: ${{ secrets.TAILSCALE_DOCKER_USERNAME }}
-      #     password: ${{ secrets.TAILSCALE_DOCKER_PASSWORD }}
-      #     registry: registry.internal.huggingface.tech
+      - name: Login to internal Container Registry
+        uses: docker/login-action@v2.1.0
+        with:
+          username: ${{ secrets.TAILSCALE_DOCKER_USERNAME }}
+          password: ${{ secrets.TAILSCALE_DOCKER_PASSWORD }}
+          registry: registry.internal.huggingface.tech
       - name: Extract metadata (tags, labels) for Docker
         id: meta-75
         uses: docker/metadata-action@v4.3.0
         with:
-          flavor: |
-            latest=auto
           images: |
+            registry.internal.huggingface.tech/api-inference/text-embeddings-inference
             ghcr.io/huggingface/text-embeddings-inference
           tags: |
             type=semver,pattern=turing-{{version}}
@@ -112,9 +111,8 @@
         id: meta-80
         uses: docker/metadata-action@v4.3.0
         with:
-          flavor: |
-            latest=auto
           images: |
+            registry.internal.huggingface.tech/api-inference/text-embeddings-inference
             ghcr.io/huggingface/text-embeddings-inference
           tags: |
             type=semver,pattern={{version}}
@@ -125,9 +123,8 @@
         id: meta-86
         uses: docker/metadata-action@v4.3.0
         with:
-          flavor: |
-            latest=auto
           images: |
+            registry.internal.huggingface.tech/api-inference/text-embeddings-inference
             ghcr.io/huggingface/text-embeddings-inference
           tags: |
             type=semver,pattern=86-{{version}}
@@ -138,9 +135,8 @@
         id: meta-90
         uses: docker/metadata-action@v4.3.0
         with:
-          flavor: |
-            latest=auto
           images: |
+            registry.internal.huggingface.tech/api-inference/text-embeddings-inference
             ghcr.io/huggingface/text-embeddings-inference
           tags: |
             type=semver,pattern=hopper-{{version}}
@@ -151,9 +147,8 @@
         id: meta-cpu
         uses: docker/metadata-action@v4.3.0
         with:
-          flavor: |
-            latest=auto
           images: |
+            registry.internal.huggingface.tech/api-inference/text-embeddings-inference
             ghcr.io/huggingface/text-embeddings-inference
           tags: |
             type=semver,pattern=cpu-{{version}}
@@ -174,6 +169,8 @@
             DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}
           tags: ${{ steps.meta-75.outputs.tags }}
           labels: ${{ steps.meta-75.outputs.labels }}
+          cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-75,mode=min
+          cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-75,mode=min
       - name: Build and push Docker image
         id: build-and-push-80
         uses: docker/build-push-action@v4
@@ -188,6 +185,8 @@
             DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}
           tags: ${{ steps.meta-80.outputs.tags }}
           labels: ${{ steps.meta-80.outputs.labels }}
+          cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-80,mode=min
+          cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-80,mode=min
       - name: Build and push Docker image
         id: build-and-push-86
         uses: docker/build-push-action@v4
@@ -202,6 +201,8 @@
             DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}
           tags: ${{ steps.meta-86.outputs.tags }}
           labels: ${{ steps.meta-86.outputs.labels }}
+          cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-86,mode=min
+          cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-86,mode=min
       - name: Build and push Docker image
         id: build-and-push-90
         uses: docker/build-push-action@v4
@@ -216,6 +217,8 @@
             DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}
           tags: ${{ steps.meta-90.outputs.tags }}
           labels: ${{ steps.meta-90.outputs.labels }}
+          cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-90,mode=min
+          cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-90,mode=min
       - name: Build and push Docker image
         id: build-and-push-cpu
         uses: docker/build-push-action@v4
@@ -229,6 +232,8 @@
             DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}
           tags: ${{ steps.meta-cpu.outputs.tags }}
           labels: ${{ steps.meta-cpu.outputs.labels }}
+          cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-cpu,mode=min
+          cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-cpu,mode=min
 
   stop-runner:
     name: Stop self-hosted EC2 runner

README.md

Lines changed: 99 additions & 1 deletion

@@ -89,10 +89,108 @@ curl 127.0.0.1:8080/embed \
 We also recommend using NVIDIA drivers with CUDA version 12 or higher.
 
 To see all options to serve your models:
-```
+
+```shell
 text-embeddings-router --help
 ```
 
+```
+Usage: text-embeddings-router [OPTIONS]
+
+Options:
+      --model-id <MODEL_ID>
+          The name of the model to load. Can be a MODEL_ID as listed on <https://hf.co/models> like `thenlper/gte-base`. Or it can be a local directory containing the necessary files as saved by `save_pretrained(...)` methods of transformers
+
+          [env: MODEL_ID=]
+          [default: thenlper/gte-base]
+
+      --revision <REVISION>
+          The actual revision of the model if you're referring to a model on the hub. You can use a specific commit id or a branch like `refs/pr/2`
+
+          [env: REVISION=]
+
+      --tokenization-workers <TOKENIZATION_WORKERS>
+          The number of tokenizer workers used for payload validation and truncation inside the router
+
+          [env: TOKENIZATION_WORKERS=]
+          [default: 8]
+
+      --dtype <DTYPE>
+          The dtype to be forced upon the model
+
+          [env: DTYPE=]
+          [default: float16]
+          [possible values: float16, float32]
+
+      --max-concurrent-requests <MAX_CONCURRENT_REQUESTS>
+          The maximum amount of concurrent requests for this particular deployment. Having a low limit will refuse clients requests instead of having them wait for too long and is usually good to handle backpressure correctly
+
+          [env: MAX_CONCURRENT_REQUESTS=]
+          [default: 512]
+
+      --max-batch-tokens <MAX_BATCH_TOKENS>
+          **IMPORTANT** This is one critical control to allow maximum usage of the available hardware.
+
+          This represents the total amount of potential tokens within a batch.
+
+          For `max_batch_tokens=1000`, you could fit `10` queries of `total_tokens=100` or a single query of `1000` tokens.
+
+          Overall this number should be the largest possible until the model is compute bound. Since the actual memory overhead depends on the model implementation, text-embeddings-inference cannot infer this number automatically.
+
+          [env: MAX_BATCH_TOKENS=]
+          [default: 16384]
+
+      --max-batch-requests <MAX_BATCH_REQUESTS>
+          Optionally control the maximum number of individual requests in a batch
+
+          [env: MAX_BATCH_REQUESTS=]
+
+      --max-client-batch-size <MAX_CLIENT_BATCH_SIZE>
+          Control the maximum number of inputs that a client can send in a single request
+
+          [env: MAX_CLIENT_BATCH_SIZE=]
+          [default: 32]
+
+      --hf-api-token <HF_API_TOKEN>
+          Your HuggingFace hub token
+
+          [env: HF_API_TOKEN=]
+
+      --hostname <HOSTNAME>
+          The IP address to listen on
+
+          [env: HOSTNAME=]
+          [default: 0.0.0.0]
+
+  -p, --port <PORT>
+          The port to listen on
+
+          [env: PORT=]
+          [default: 3000]
+
+      --uds-path <UDS_PATH>
+          The name of the unix socket some text-embeddings-inference backends will use as they communicate internally with gRPC
+
+          [env: UDS_PATH=]
+          [default: /tmp/text-embeddings-inference-server]
+
+      --huggingface-hub-cache <HUGGINGFACE_HUB_CACHE>
+          The location of the huggingface hub cache. Used to override the location if you want to provide a mounted disk for instance
+
+          [env: HUGGINGFACE_HUB_CACHE=/data]
+
+      --json-output
+          Outputs the logs in JSON format (useful for telemetry)
+
+          [env: JSON_OUTPUT=]
+
+      --otlp-endpoint <OTLP_ENDPOINT>
+          [env: OTLP_ENDPOINT=]
+
+      --cors-allow-origin <CORS_ALLOW_ORIGIN>
+          [env: CORS_ALLOW_ORIGIN=]
+```
+
 ### Docker Images
 
 Text Embeddings Inference ships with multiple Docker images that you can use to target a specific backend:

backends/candle/src/compute_cap.rs

Lines changed: 11 additions & 9 deletions

@@ -1,17 +1,19 @@
+use candle::cuda_backend::cudarc::driver::sys::CUdevice_attribute::{
+    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
+};
+use candle::cuda_backend::cudarc::driver::CudaDevice;
 use lazy_static::lazy_static;
 
 lazy_static! {
     pub static ref RUNTIME_COMPUTE_CAP: usize = {
-        let out = std::process::Command::new("nvidia-smi")
-            .arg("--query-gpu=compute_cap")
-            .arg("--format=csv")
-            .output()
+        let device = CudaDevice::new(0).expect("cuda is not available");
+        let major = device
+            .attribute(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR)
             .unwrap();
-        let out = std::str::from_utf8(&out.stdout).unwrap();
-        let mut lines = out.lines();
-        assert_eq!(lines.next().unwrap(), "compute_cap");
-        let cap = lines.next().unwrap().replace('.', "");
-        cap.parse::<usize>().unwrap()
+        let minor = device
+            .attribute(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR)
+            .unwrap();
+        (major * 10 + minor) as usize
     };
     pub static ref COMPILE_COMPUTE_CAP: usize = env!("CUDA_COMPUTE_CAP").parse::<usize>().unwrap();
 }
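
This change replaces the `nvidia-smi` text parsing with a direct cudarc driver query, so the runtime capability is now read as `major * 10 + minor` (e.g. 75, 80, 86, 90). Below is a minimal sketch of how such a value might be checked against the compile-time capability; the helper name and the compatibility rule (same major architecture, runtime minor at least the compile-time minor) are assumptions for illustration, not code from this commit.

```rust
// Hypothetical helper, not part of this commit: compare the compute capability
// detected at runtime (e.g. 86 for an RTX 3090) with the one the kernels were
// built for via the CUDA_COMPUTE_CAP environment variable at compile time.
fn compute_cap_matches(runtime: usize, compile: usize) -> bool {
    // Assumed rule: binaries built for one major architecture only run on
    // devices of that same major, and only if the device's minor revision is
    // at least the one targeted at compile time (e.g. sm_80 binaries on an
    // sm_86 device, but not sm_86 binaries on sm_80, and never 8x on 7x).
    runtime / 10 == compile / 10 && runtime >= compile
}

fn main() {
    let runtime = 86; // stand-in for *RUNTIME_COMPUTE_CAP
    let compile = 80; // stand-in for *COMPILE_COMPUTE_CAP
    assert!(compute_cap_matches(runtime, compile));
    assert!(!compute_cap_matches(75, compile));
}
```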

eval.py

Lines changed: 0 additions & 55 deletions
This file was deleted.

router/src/main.rs

Lines changed: 4 additions & 9 deletions

@@ -60,26 +60,21 @@ struct Args {
     /// of the available hardware.
     ///
     /// This represents the total amount of potential tokens within a batch.
-    /// When using padding (not recommended) this would be equivalent of
-    /// `batch_size` * `max_total_tokens`.
     ///
-    /// However in the non-padded (flash attention) version this can be much finer.
-    ///
-    /// For `max_batch_total_tokens=1000`, you could fit `10` queries of `total_tokens=100`
+    /// For `max_batch_tokens=1000`, you could fit `10` queries of `total_tokens=100`
     /// or a single query of `1000` tokens.
     ///
     /// Overall this number should be the largest possible until the model is compute bound.
-    /// Since the actual memory overhead depends on other parameters like if you're flash attention
-    /// or the model implementation, text-embeddings-inference cannot infer this number
-    /// automatically.
+    /// Since the actual memory overhead depends on the model implementation,
+    /// text-embeddings-inference cannot infer this number automatically.
     #[clap(default_value = "16384", long, env)]
     max_batch_tokens: usize,
 
     /// Optionally control the maximum number of individual requests in a batch
     #[clap(long, env)]
     max_batch_requests: Option<usize>,
 
-    /// Control the maximum number of inputs that a client can send
+    /// Control the maximum number of inputs that a client can send in a single request
     #[clap(default_value = "32", long, env)]
     max_client_batch_size: usize,
 