
Commit 2defbcc

rely on driver, not on nvidia-smi
1 parent 23f5f7f commit 2defbcc

3 files changed: +122 -20 lines changed


.github/workflows/build.yaml

Lines changed: 10 additions & 10 deletions
@@ -169,8 +169,8 @@
             DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}
           tags: ${{ steps.meta-75.outputs.tags }}
           labels: ${{ steps.meta-75.outputs.labels }}
-          cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache,mode=max
-          cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache,mode=max
+          cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-75,mode=min
+          cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-75,mode=min
       - name: Build and push Docker image
         id: build-and-push-80
         uses: docker/build-push-action@v4
@@ -185,8 +185,8 @@
             DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}
           tags: ${{ steps.meta-80.outputs.tags }}
           labels: ${{ steps.meta-80.outputs.labels }}
-          cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache,mode=max
-          cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache,mode=max
+          cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-80,mode=min
+          cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-80,mode=min
       - name: Build and push Docker image
         id: build-and-push-86
         uses: docker/build-push-action@v4
@@ -201,8 +201,8 @@
             DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}
           tags: ${{ steps.meta-86.outputs.tags }}
           labels: ${{ steps.meta-86.outputs.labels }}
-          cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache,mode=max
-          cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache,mode=max
+          cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-86,mode=min
+          cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-86,mode=min
       - name: Build and push Docker image
         id: build-and-push-90
         uses: docker/build-push-action@v4
@@ -217,8 +217,8 @@
             DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}
           tags: ${{ steps.meta-90.outputs.tags }}
           labels: ${{ steps.meta-90.outputs.labels }}
-          cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache,mode=max
-          cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache,mode=max
+          cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-90,mode=min
+          cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-90,mode=min
       - name: Build and push Docker image
         id: build-and-push-cpu
         uses: docker/build-push-action@v4
@@ -232,8 +232,8 @@
             DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}
           tags: ${{ steps.meta-cpu.outputs.tags }}
           labels: ${{ steps.meta-cpu.outputs.labels }}
-          cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache,mode=max
-          cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache,mode=max
+          cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-cpu,mode=min
+          cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-cpu,mode=min
 
   stop-runner:
     name: Stop self-hosted EC2 runner

README.md

Lines changed: 101 additions & 1 deletion
@@ -89,10 +89,110 @@ curl 127.0.0.1:8080/embed \
 We also recommend using NVIDIA drivers with CUDA version 12 or higher.
 
 To see all options to serve your models:
-```
+
+```shell
 text-embeddings-router --help
 ```
 
+```
+Usage: text-embeddings-router [OPTIONS]
+
+Options:
+      --model-id <MODEL_ID>
+          The name of the model to load. Can be a MODEL_ID as listed on <https://hf.co/models> like `thenlper/gte-base`. Or it can be a local directory containing the necessary files as saved by `save_pretrained(...)` methods of transformers
+
+          [env: MODEL_ID=]
+          [default: thenlper/gte-base]
+
+      --revision <REVISION>
+          The actual revision of the model if you are referring to a model on the hub. You can use a specific commit id or a branch like `refs/pr/2`
+
+          [env: REVISION=]
+
+      --tokenization-workers <TOKENIZATION_WORKERS>
+          The number of tokenizer workers used for payload validation and truncation inside the router
+
+          [env: TOKENIZATION_WORKERS=]
+          [default: 8]
+
+      --dtype <DTYPE>
+          The dtype to be forced upon the model
+
+          [env: DTYPE=]
+          [default: float16]
+          [possible values: float16, float32]
+
+      --max-concurrent-requests <MAX_CONCURRENT_REQUESTS>
+          The maximum amount of concurrent requests for this particular deployment. Having a low limit will refuse clients requests instead of having them wait for too long and is usually good to handle backpressure correctly
+
+          [env: MAX_CONCURRENT_REQUESTS=]
+          [default: 512]
+
+      --max-batch-tokens <MAX_BATCH_TOKENS>
+          **IMPORTANT** This is one critical control to allow maximum usage of the available hardware.
+
+          This represents the total amount of potential tokens within a batch. When using padding (not recommended) this would be equivalent of `batch_size` * `max_total_tokens`.
+
+          However in the non-padded (flash attention) version this can be much finer.
+
+          For `max_batch_total_tokens=1000`, you could fit `10` queries of `total_tokens=100` or a single query of `1000` tokens.
+
+          Overall this number should be the largest possible until the model is compute bound. Since the actual memory overhead depends on other parameters like if you are flash attention or the model implementation, text-embeddings cannot infer this number automatically.
+
+          [env: MAX_BATCH_TOKENS=]
+          [default: 8192]
+
+      --max-batch-requests <MAX_BATCH_REQUESTS>
+          Optionally control the maximum number of individual requests in a batch
+
+          [env: MAX_BATCH_REQUESTS=]
+
+      --max-client-batch-size <MAX_CLIENT_BATCH_SIZE>
+          Control the maximum number of inputs that a client can send
+
+          [env: MAX_CLIENT_BATCH_SIZE=]
+          [default: 32]
+
+      --hf-api-token <HF_API_TOKEN>
+          Your HuggingFace hub token
+
+          [env: HF_API_TOKEN=]
+
+      --hostname <HOSTNAME>
+          The IP address to listen on
+
+          [env: HOSTNAME=]
+          [default: 0.0.0.0]
+
+  -p, --port <PORT>
+          The port to listen on
+
+          [env: PORT=]
+          [default: 3000]
+
+      --uds-path <UDS_PATH>
+          The name of the unix socket some text-embeddings backends will use as they communicate internally with gRPC
+
+          [env: UDS_PATH=]
+          [default: /tmp/text-embeddings-server]
+
+      --huggingface-hub-cache <HUGGINGFACE_HUB_CACHE>
+          The location of the huggingface hub cache. Used to override the location if you want to provide a mounted disk for instance
+
+          [env: HUGGINGFACE_HUB_CACHE=/data]
+
+      --json-output
+          Outputs the logs in JSON format (useful for telemetry)
+
+          [env: JSON_OUTPUT=]
+
+      --otlp-endpoint <OTLP_ENDPOINT>
+          [env: OTLP_ENDPOINT=]
+
+      --cors-allow-origin <CORS_ALLOW_ORIGIN>
+          [env: CORS_ALLOW_ORIGIN=]
+```
+
 ### Docker Images
 
 Text Embeddings Inference ships with multiple Docker images that you can use to target a specific backend:
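
The `--max-batch-tokens` option documented above boils down to a token-budget calculation: requests are packed into a batch until their summed token counts would exceed the limit. Below is a minimal sketch of that arithmetic only; the `Request` struct and `fill_batch` helper are hypothetical and not part of the router's actual scheduler.

```rust
use std::collections::VecDeque;

// Illustration only: greedy packing under a token budget, mirroring the
// `--max-batch-tokens` rule described in the help text above. This is a
// hypothetical sketch, not the router's real batching code.
struct Request {
    token_count: usize,
}

/// Pop requests from the front of the queue while they still fit in the budget.
fn fill_batch(queue: &mut VecDeque<Request>, max_batch_tokens: usize) -> Vec<Request> {
    let mut batch = Vec::new();
    let mut budget = max_batch_tokens;
    while let Some(next_tokens) = queue.front().map(|r| r.token_count) {
        if next_tokens > budget {
            break;
        }
        budget -= next_tokens;
        batch.push(queue.pop_front().unwrap());
    }
    batch
}

fn main() {
    // Ten 100-token requests fit in a 1000-token budget...
    let mut queue: VecDeque<Request> = (0..10).map(|_| Request { token_count: 100 }).collect();
    assert_eq!(fill_batch(&mut queue, 1000).len(), 10);

    // ...and so does a single 1000-token request.
    let mut queue: VecDeque<Request> = std::iter::once(Request { token_count: 1000 }).collect();
    assert_eq!(fill_batch(&mut queue, 1000).len(), 1);
}
```

With a budget of 1000 tokens this reproduces the example from the help text: ten 100-token requests, or one 1000-token request, fill a single batch.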

backends/candle/src/compute_cap.rs

Lines changed: 11 additions & 9 deletions
@@ -1,17 +1,19 @@
+use candle::cuda_backend::cudarc::driver::sys::CUdevice_attribute::{
+    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
+};
+use candle::cuda_backend::cudarc::driver::CudaDevice;
 use lazy_static::lazy_static;
 
 lazy_static! {
     pub static ref RUNTIME_COMPUTE_CAP: usize = {
-        let out = std::process::Command::new("nvidia-smi")
-            .arg("--query-gpu=compute_cap")
-            .arg("--format=csv")
-            .output()
+        let device = CudaDevice::new(0).expect("cuda is not available");
+        let major = device
+            .attribute(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR)
             .unwrap();
-        let out = std::str::from_utf8(&out.stdout).unwrap();
-        let mut lines = out.lines();
-        assert_eq!(lines.next().unwrap(), "compute_cap");
-        let cap = lines.next().unwrap().replace('.', "");
-        cap.parse::<usize>().unwrap()
+        let minor = device
+            .attribute(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR)
+            .unwrap();
+        (major * 10 + minor) as usize
     };
     pub static ref COMPILE_COMPUTE_CAP: usize = env!("CUDA_COMPUTE_CAP").parse::<usize>().unwrap();
 }
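
Both statics now collapse the compute capability into a single integer (`major * 10 + minor`, e.g. `75` for a T4 or `80` for an A100), so comparing the runtime GPU against the capability the binary was compiled for becomes a plain integer comparison. The sketch below is a hypothetical illustration of such a check, not the crate's actual validation logic, and it deliberately ignores the finer rules about which compiled kernels run on which architectures.

```rust
// Hypothetical startup check built on RUNTIME_COMPUTE_CAP / COMPILE_COMPUTE_CAP
// style values; the real crate may enforce stricter compatibility rules.
fn check_compute_cap(runtime_cap: usize, compile_cap: usize) -> Result<(), String> {
    if runtime_cap < compile_cap {
        Err(format!(
            "GPU reports compute capability {}, but this binary was compiled for {}",
            runtime_cap, compile_cap
        ))
    } else {
        Ok(())
    }
}

fn main() {
    // A binary compiled for sm_80 cannot run on an sm_75 GPU.
    assert!(check_compute_cap(75, 80).is_err());
    // Matching capabilities pass.
    assert!(check_compute_cap(80, 80).is_ok());
}
```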
