
Commit b718452

Merge pull request #1 from huggingface/feat/ci_caching
ci: add caching
2 parents b26fd46 + 297257b commit b718452

5 files changed: +140, -95 lines changed


.github/workflows/build.yaml

Lines changed: 26 additions & 21 deletions

@@ -78,30 +78,29 @@
           install: true
       - name: Inject slug/short variables
         uses: rlespinasse/github-slug-action@v4.4.1
-      # - name: Tailscale
-      #   uses: tailscale/github-action@7bd8039bf25c23c4ab1b8d6e2cc2da2280601966
-      #   with:
-      #     authkey: ${{ secrets.TAILSCALE_AUTHKEY }}
+      - name: Tailscale
+        uses: tailscale/github-action@7bd8039bf25c23c4ab1b8d6e2cc2da2280601966
+        with:
+          authkey: ${{ secrets.TAILSCALE_AUTHKEY }}
       - name: Login to GitHub Container Registry
-        # if: github.event_name != 'pull_request'
+        if: github.event_name != 'pull_request'
         uses: docker/login-action@v2
         with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
-      # - name: Login to internal Container Registry
-      #   uses: docker/login-action@v2.1.0
-      #   with:
-      #     username: ${{ secrets.TAILSCALE_DOCKER_USERNAME }}
-      #     password: ${{ secrets.TAILSCALE_DOCKER_PASSWORD }}
-      #     registry: registry.internal.huggingface.tech
+      - name: Login to internal Container Registry
+        uses: docker/login-action@v2.1.0
+        with:
+          username: ${{ secrets.TAILSCALE_DOCKER_USERNAME }}
+          password: ${{ secrets.TAILSCALE_DOCKER_PASSWORD }}
+          registry: registry.internal.huggingface.tech
       - name: Extract metadata (tags, labels) for Docker
         id: meta-75
         uses: docker/metadata-action@v4.3.0
         with:
-          flavor: |
-            latest=auto
           images: |
+            registry.internal.huggingface.tech/api-inference/text-embeddings-inference
             ghcr.io/huggingface/text-embeddings-inference
           tags: |
             type=semver,pattern=turing-{{version}}
@@ -112,9 +111,8 @@
         id: meta-80
         uses: docker/metadata-action@v4.3.0
         with:
-          flavor: |
-            latest=auto
           images: |
+            registry.internal.huggingface.tech/api-inference/text-embeddings-inference
             ghcr.io/huggingface/text-embeddings-inference
           tags: |
             type=semver,pattern={{version}}
@@ -125,9 +123,8 @@
         id: meta-86
         uses: docker/metadata-action@v4.3.0
         with:
-          flavor: |
-            latest=auto
           images: |
+            registry.internal.huggingface.tech/api-inference/text-embeddings-inference
             ghcr.io/huggingface/text-embeddings-inference
           tags: |
             type=semver,pattern=86-{{version}}
@@ -138,9 +135,8 @@
         id: meta-90
         uses: docker/metadata-action@v4.3.0
         with:
-          flavor: |
-            latest=auto
           images: |
+            registry.internal.huggingface.tech/api-inference/text-embeddings-inference
             ghcr.io/huggingface/text-embeddings-inference
           tags: |
             type=semver,pattern=hopper-{{version}}
@@ -151,9 +147,8 @@
         id: meta-cpu
         uses: docker/metadata-action@v4.3.0
         with:
-          flavor: |
-            latest=auto
           images: |
+            registry.internal.huggingface.tech/api-inference/text-embeddings-inference
             ghcr.io/huggingface/text-embeddings-inference
           tags: |
             type=semver,pattern=cpu-{{version}}
@@ -174,6 +169,8 @@
             DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}
           tags: ${{ steps.meta-75.outputs.tags }}
           labels: ${{ steps.meta-75.outputs.labels }}
+          cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-75,mode=min
+          cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-75,mode=min
       - name: Build and push Docker image
         id: build-and-push-80
         uses: docker/build-push-action@v4
@@ -188,6 +185,8 @@
             DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}
           tags: ${{ steps.meta-80.outputs.tags }}
           labels: ${{ steps.meta-80.outputs.labels }}
+          cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-80,mode=min
+          cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-80,mode=min
       - name: Build and push Docker image
         id: build-and-push-86
         uses: docker/build-push-action@v4
@@ -202,6 +201,8 @@
             DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}
           tags: ${{ steps.meta-86.outputs.tags }}
           labels: ${{ steps.meta-86.outputs.labels }}
+          cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-86,mode=min
+          cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-86,mode=min
       - name: Build and push Docker image
         id: build-and-push-90
         uses: docker/build-push-action@v4
@@ -216,6 +217,8 @@
             DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}
           tags: ${{ steps.meta-90.outputs.tags }}
           labels: ${{ steps.meta-90.outputs.labels }}
+          cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-90,mode=min
+          cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-90,mode=min
       - name: Build and push Docker image
         id: build-and-push-cpu
         uses: docker/build-push-action@v4
@@ -229,6 +232,8 @@
             DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}
           tags: ${{ steps.meta-cpu.outputs.tags }}
           labels: ${{ steps.meta-cpu.outputs.labels }}
+          cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-cpu,mode=min
+          cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-cpu,mode=min
 
   stop-runner:
     name: Stop self-hosted EC2 runner

README.md

Lines changed: 99 additions & 1 deletion

@@ -89,10 +89,108 @@ curl 127.0.0.1:8080/embed \
 We also recommend using NVIDIA drivers with CUDA version 12 or higher.
 
 To see all options to serve your models:
-```
+
+```shell
 text-embeddings-router --help
 ```
 
+```
+Usage: text-embeddings-router [OPTIONS]
+
+Options:
+      --model-id <MODEL_ID>
+          The name of the model to load. Can be a MODEL_ID as listed on <https://hf.co/models> like `thenlper/gte-base`. Or it can be a local directory containing the necessary files as saved by `save_pretrained(...)` methods of transformers
+
+          [env: MODEL_ID=]
+          [default: thenlper/gte-base]
+
+      --revision <REVISION>
+          The actual revision of the model if you're referring to a model on the hub. You can use a specific commit id or a branch like `refs/pr/2`
+
+          [env: REVISION=]
+
+      --tokenization-workers <TOKENIZATION_WORKERS>
+          The number of tokenizer workers used for payload validation and truncation inside the router
+
+          [env: TOKENIZATION_WORKERS=]
+          [default: 8]
+
+      --dtype <DTYPE>
+          The dtype to be forced upon the model
+
+          [env: DTYPE=]
+          [default: float16]
+          [possible values: float16, float32]
+
+      --max-concurrent-requests <MAX_CONCURRENT_REQUESTS>
+          The maximum amount of concurrent requests for this particular deployment. Having a low limit will refuse clients requests instead of having them wait for too long and is usually good to handle backpressure correctly
+
+          [env: MAX_CONCURRENT_REQUESTS=]
+          [default: 512]
+
+      --max-batch-tokens <MAX_BATCH_TOKENS>
+          **IMPORTANT** This is one critical control to allow maximum usage of the available hardware.
+
+          This represents the total amount of potential tokens within a batch.
+
+          For `max_batch_tokens=1000`, you could fit `10` queries of `total_tokens=100` or a single query of `1000` tokens.
+
+          Overall this number should be the largest possible until the model is compute bound. Since the actual memory overhead depends on the model implementation, text-embeddings-inference cannot infer this number automatically.
+
+          [env: MAX_BATCH_TOKENS=]
+          [default: 16384]
+
+      --max-batch-requests <MAX_BATCH_REQUESTS>
+          Optionally control the maximum number of individual requests in a batch
+
+          [env: MAX_BATCH_REQUESTS=]
+
+      --max-client-batch-size <MAX_CLIENT_BATCH_SIZE>
+          Control the maximum number of inputs that a client can send in a single request
+
+          [env: MAX_CLIENT_BATCH_SIZE=]
+          [default: 32]
+
+      --hf-api-token <HF_API_TOKEN>
+          Your HuggingFace hub token
+
+          [env: HF_API_TOKEN=]
+
+      --hostname <HOSTNAME>
+          The IP address to listen on
+
+          [env: HOSTNAME=]
+          [default: 0.0.0.0]
+
+  -p, --port <PORT>
+          The port to listen on
+
+          [env: PORT=]
+          [default: 3000]
+
+      --uds-path <UDS_PATH>
+          The name of the unix socket some text-embeddings-inference backends will use as they communicate internally with gRPC
+
+          [env: UDS_PATH=]
+          [default: /tmp/text-embeddings-inference-server]
+
+      --huggingface-hub-cache <HUGGINGFACE_HUB_CACHE>
+          The location of the huggingface hub cache. Used to override the location if you want to provide a mounted disk for instance
+
+          [env: HUGGINGFACE_HUB_CACHE=/data]
+
+      --json-output
+          Outputs the logs in JSON format (useful for telemetry)
+
+          [env: JSON_OUTPUT=]
+
+      --otlp-endpoint <OTLP_ENDPOINT>
+          [env: OTLP_ENDPOINT=]
+
+      --cors-allow-origin <CORS_ALLOW_ORIGIN>
+          [env: CORS_ALLOW_ORIGIN=]
+```
+
 ### Docker Images
 
 Text Embeddings Inference ships with multiple Docker images that you can use to target a specific backend:

backends/candle/src/compute_cap.rs

Lines changed: 11 additions & 9 deletions

@@ -1,17 +1,19 @@
+use candle::cuda_backend::cudarc::driver::sys::CUdevice_attribute::{
+    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
+};
+use candle::cuda_backend::cudarc::driver::CudaDevice;
 use lazy_static::lazy_static;
 
 lazy_static! {
     pub static ref RUNTIME_COMPUTE_CAP: usize = {
-        let out = std::process::Command::new("nvidia-smi")
-            .arg("--query-gpu=compute_cap")
-            .arg("--format=csv")
-            .output()
+        let device = CudaDevice::new(0).expect("cuda is not available");
+        let major = device
+            .attribute(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR)
             .unwrap();
-        let out = std::str::from_utf8(&out.stdout).unwrap();
-        let mut lines = out.lines();
-        assert_eq!(lines.next().unwrap(), "compute_cap");
-        let cap = lines.next().unwrap().replace('.', "");
-        cap.parse::<usize>().unwrap()
+        let minor = device
+            .attribute(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR)
+            .unwrap();
+        (major * 10 + minor) as usize
     };
     pub static ref COMPILE_COMPUTE_CAP: usize = env!("CUDA_COMPUTE_CAP").parse::<usize>().unwrap();
 }
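
This change replaces the `nvidia-smi` text parsing with a direct cudarc driver query, so the runtime capability is now read as `major * 10 + minor` (e.g. 75, 80, 86, 90). Below is a minimal sketch of how such a value might be checked against the compile-time capability; the helper name and the compatibility rule (same major architecture, runtime minor at least the compile-time minor) are assumptions for illustration, not code from this commit.

```rust
// Hypothetical helper, not part of this commit: compare the compute capability
// detected at runtime (e.g. 86 for an RTX 3090) with the one the kernels were
// built for via the CUDA_COMPUTE_CAP environment variable at compile time.
fn compute_cap_matches(runtime: usize, compile: usize) -> bool {
    // Assumed rule: binaries built for one major architecture only run on
    // devices of that same major, and only if the device's minor revision is
    // at least the one targeted at compile time (e.g. sm_80 binaries on an
    // sm_86 device, but not sm_86 binaries on sm_80, and never 8x on 7x).
    runtime / 10 == compile / 10 && runtime >= compile
}

fn main() {
    let runtime = 86; // stand-in for *RUNTIME_COMPUTE_CAP
    let compile = 80; // stand-in for *COMPILE_COMPUTE_CAP
    assert!(compute_cap_matches(runtime, compile));
    assert!(!compute_cap_matches(75, compile));
}
```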

eval.py

Lines changed: 0 additions & 55 deletions
This file was deleted.

router/src/main.rs

Lines changed: 4 additions & 9 deletions

@@ -60,26 +60,21 @@ struct Args {
     /// of the available hardware.
     ///
     /// This represents the total amount of potential tokens within a batch.
-    /// When using padding (not recommended) this would be equivalent of
-    /// `batch_size` * `max_total_tokens`.
     ///
-    /// However in the non-padded (flash attention) version this can be much finer.
-    ///
-    /// For `max_batch_total_tokens=1000`, you could fit `10` queries of `total_tokens=100`
+    /// For `max_batch_tokens=1000`, you could fit `10` queries of `total_tokens=100`
     /// or a single query of `1000` tokens.
     ///
     /// Overall this number should be the largest possible until the model is compute bound.
-    /// Since the actual memory overhead depends on other parameters like if you're flash attention
-    /// or the model implementation, text-embeddings-inference cannot infer this number
-    /// automatically.
+    /// Since the actual memory overhead depends on the model implementation,
+    /// text-embeddings-inference cannot infer this number automatically.
     #[clap(default_value = "16384", long, env)]
     max_batch_tokens: usize,
 
     /// Optionally control the maximum number of individual requests in a batch
     #[clap(long, env)]
     max_batch_requests: Option<usize>,
 
-    /// Control the maximum number of inputs that a client can send
+    /// Control the maximum number of inputs that a client can send in a single request
     #[clap(default_value = "32", long, env)]
     max_client_batch_size: usize,
 