
Commit 7b9245d

feat(onnx): add onnx runtime for better CPU perf (#328)
1 parent: 052037c

File tree

16 files changed: +626 -210 lines

Cargo.lock

Lines changed: 83 additions & 0 deletions
Some generated files are not rendered by default.

Cargo.toml

Lines changed: 1 addition & 0 deletions
@@ -2,6 +2,7 @@
 members = [
     "backends",
     "backends/candle",
+    "backends/ort",
     "backends/core",
     "backends/python",
     "backends/grpc-client",

Dockerfile

Lines changed: 4 additions & 34 deletions
@@ -28,22 +28,9 @@ ARG ACTIONS_CACHE_URL
 ARG ACTIONS_RUNTIME_TOKEN
 ARG SCCACHE_GHA_ENABLED

-RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
-    | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
-    echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | \
-    tee /etc/apt/sources.list.d/oneAPI.list
-
-RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
-    intel-oneapi-mkl-devel=2024.0.0-49656 \
-    build-essential \
-    && rm -rf /var/lib/apt/lists/*
-
-RUN echo "int mkl_serv_intel_cpu_true() {return 1;}" > fakeintel.c && \
-    gcc -shared -fPIC -o libfakeintel.so fakeintel.c
-
 COPY --from=planner /usr/src/recipe.json recipe.json

-RUN cargo chef cook --release --features candle --features mkl-dynamic --no-default-features --recipe-path recipe.json && sccache -s
+RUN cargo chef cook --release --features ort --no-default-features --recipe-path recipe.json && sccache -s

 COPY backends backends
 COPY core core
@@ -53,7 +40,7 @@ COPY Cargo.lock ./

 FROM builder as http-builder

-RUN cargo build --release --bin text-embeddings-router -F candle -F mkl-dynamic -F http --no-default-features && sccache -s
+RUN cargo build --release --bin text-embeddings-router -F ort -F http --no-default-features && sccache -s

 FROM builder as grpc-builder

@@ -65,35 +52,18 @@ RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \

 COPY proto proto

-RUN cargo build --release --bin text-embeddings-router -F grpc -F candle -F mkl-dynamic --no-default-features && sccache -s
+RUN cargo build --release --bin text-embeddings-router -F grpc -F ort --no-default-features && sccache -s

 FROM debian:bookworm-slim as base

 ENV HUGGINGFACE_HUB_CACHE=/data \
-    PORT=80 \
-    MKL_ENABLE_INSTRUCTIONS=AVX512_E4 \
-    RAYON_NUM_THREADS=8 \
-    LD_PRELOAD=/usr/local/libfakeintel.so \
-    LD_LIBRARY_PATH=/usr/local/lib
+    PORT=80

 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
-    libomp-dev \
     ca-certificates \
     libssl-dev \
-    curl \
     && rm -rf /var/lib/apt/lists/*

-# Copy a lot of the Intel shared objects because of the mkl_serv_intel_cpu_true patch...
-COPY --from=builder /opt/intel/oneapi/mkl/latest/lib/intel64/libmkl_intel_lp64.so.2 /usr/local/lib/libmkl_intel_lp64.so.2
-COPY --from=builder /opt/intel/oneapi/mkl/latest/lib/intel64/libmkl_intel_thread.so.2 /usr/local/lib/libmkl_intel_thread.so.2
-COPY --from=builder /opt/intel/oneapi/mkl/latest/lib/intel64/libmkl_core.so.2 /usr/local/lib/libmkl_core.so.2
-COPY --from=builder /opt/intel/oneapi/mkl/latest/lib/intel64/libmkl_vml_def.so.2 /usr/local/lib/libmkl_vml_def.so.2
-COPY --from=builder /opt/intel/oneapi/mkl/latest/lib/intel64/libmkl_def.so.2 /usr/local/lib/libmkl_def.so.2
-COPY --from=builder /opt/intel/oneapi/mkl/latest/lib/intel64/libmkl_vml_avx2.so.2 /usr/local/lib/libmkl_vml_avx2.so.2
-COPY --from=builder /opt/intel/oneapi/mkl/latest/lib/intel64/libmkl_vml_avx512.so.2 /usr/local/lib/libmkl_vml_avx512.so.2
-COPY --from=builder /opt/intel/oneapi/mkl/latest/lib/intel64/libmkl_avx2.so.2 /usr/local/lib/libmkl_avx2.so.2
-COPY --from=builder /opt/intel/oneapi/mkl/latest/lib/intel64/libmkl_avx512.so.2 /usr/local/lib/libmkl_avx512.so.2
-COPY --from=builder /usr/src/libfakeintel.so /usr/local/libfakeintel.so

 FROM base as grpc


Dockerfile-arm64

Lines changed: 0 additions & 92 deletions
This file was deleted.

README.md

Lines changed: 4 additions & 4 deletions
@@ -72,9 +72,9 @@ Below are some examples of the currently supported models:
 | MTEB Rank | Model Size | Model Type | Model ID |
 |-----------|---------------------|-------------|--------------------------------------------------------------------------------------------------|
 | 1 | 7B (Very Expensive) | Mistral | [Salesforce/SFR-Embedding-2_R](https://hf.co/Salesforce/SFR-Embedding-2_R) |
-| 2 | 7B (Very Expensive) | Qwen2 | [Alibaba-NLP/gte-Qwen2-7B-instruct](https://hf.co/Alibaba-NLP/gte-Qwen2-7B-instruct) |
-| 9 | 1.5B (Expensive) | Qwen2 | [Alibaba-NLP/gte-Qwen2-1.5B-instruct](https://hf.co/Alibaba-NLP/gte-Qwen2-1.5B-instruct) |
-| 15 | 0.4B | Alibaba GTE | [Alibaba-NLP/gte-large-en-v1.5](Alibaba-NLP/gte-large-en-v1.5) |
+| 2 | 7B (Very Expensive) | Qwen2 | [Alibaba-NLP/gte-Qwen2-7B-instruct](https://hf.co/Alibaba-NLP/gte-Qwen2-7B-instruct) |
+| 9 | 1.5B (Expensive) | Qwen2 | [Alibaba-NLP/gte-Qwen2-1.5B-instruct](https://hf.co/Alibaba-NLP/gte-Qwen2-1.5B-instruct) |
+| 15 | 0.4B | Alibaba GTE | [Alibaba-NLP/gte-large-en-v1.5](https://hf.co/Alibaba-NLP/gte-large-en-v1.5) |
 | 20 | 0.3B | Bert | [WhereIsAI/UAE-Large-V1](https://hf.co/WhereIsAI/UAE-Large-V1) |
 | 24 | 0.5B | XLM-RoBERTa | [intfloat/multilingual-e5-large-instruct](https://hf.co/intfloat/multilingual-e5-large-instruct) |
 | N/A | 0.1B | NomicBert | [nomic-ai/nomic-embed-text-v1](https://hf.co/nomic-ai/nomic-embed-text-v1) |
@@ -568,7 +568,7 @@ supported via Docker. As such inference will be CPU bound and most likely pretty
 M1/M2 ARM CPU.

 ```
-docker build . -f Dockerfile-arm64 --platform=linux/arm64
+docker build . -f Dockerfile --platform=linux/arm64
 ```

 ## Examples

backends/Cargo.toml

Lines changed: 4 additions & 0 deletions
@@ -7,15 +7,19 @@ homepage.workspace = true

 [dependencies]
 clap = { workspace = true, optional = true }
+hf-hub = { workspace = true }
+serde_json = { workspace = true }
 text-embeddings-backend-core = { path = "core" }
 text-embeddings-backend-python = { path = "python", optional = true }
 text-embeddings-backend-candle = { path = "candle", optional = true }
+text-embeddings-backend-ort = { path = "ort", optional = true }
 tokio = { workspace = true }
 tracing = { workspace = true }

 [features]
 clap = ["dep:clap", "text-embeddings-backend-core/clap"]
 python = ["dep:text-embeddings-backend-python"]
+ort = ["dep:text-embeddings-backend-ort"]
 candle = ["dep:text-embeddings-backend-candle"]
 cuda = ["text-embeddings-backend-candle?/cuda"]
 metal = ["text-embeddings-backend-candle?/metal"]

backends/ort/Cargo.toml

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+[package]
+name = "text-embeddings-backend-ort"
+version.workspace = true
+edition.workspace = true
+authors.workspace = true
+homepage.workspace = true
+
+[dependencies]
+anyhow = { workspace = true }
+nohash-hasher = { workspace = true }
+ndarray = "0.15.6"
+ort = { version = "2.0.0-rc.2", default-features = false, features = ["download-binaries", "half", "onednn", "ndarray"] }
+text-embeddings-backend-core = { path = "../core" }
+tracing = { workspace = true }
+thiserror = { workspace = true }
+serde = { workspace = true }
+serde_json = { workspace = true }
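
The manifest pins `ort` 2.0.0-rc.2 with the `ndarray` feature, so the backend can feed ONNX Runtime directly from `ndarray` arrays; `download-binaries` fetches a prebuilt ONNX Runtime at build time (replacing the hand-rolled MKL setup the Dockerfile just dropped) and `onednn` enables the oneDNN execution provider. For orientation only, a minimal sketch of that style of usage follows. It is not code from this commit: the model path and the input/output tensor names are placeholders, and the builder/extraction method names have shifted between the 2.0 release candidates, so treat it as approximate.

```rust
use anyhow::Result;
use ndarray::Array2;
use ort::{GraphOptimizationLevel, Session};

fn main() -> Result<()> {
    // Build a session from an ONNX export of the model (path is a placeholder).
    let session = Session::builder()?
        .with_optimization_level(GraphOptimizationLevel::Level3)?
        .with_intra_threads(4)?
        .commit_from_file("model.onnx")?;

    // A single dummy sequence of length 8; real inputs come from the tokenizer.
    let input_ids: Array2<i64> = Array2::zeros((1, 8));
    let attention_mask: Array2<i64> = Array2::ones((1, 8));

    // Input names depend on how the model was exported to ONNX.
    let outputs = session.run(ort::inputs![
        "input_ids" => input_ids.view(),
        "attention_mask" => attention_mask.view(),
    ]?)?;

    // The output name likewise depends on the export; extract as f32 and
    // pool/normalize downstream to obtain sentence embeddings.
    let hidden = outputs["last_hidden_state"].try_extract_tensor::<f32>()?;
    println!("output shape: {:?}", hidden.shape());
    Ok(())
}
```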
