
Commit 463329a

feat: USE_FLASH_ATTENTION env var (#57)
1 parent 04f94f6 · commit 463329a

File tree

.github/workflows/build_75.yaml
Dockerfile-cuda
README.md
backends/candle/src/lib.rs

4 files changed: +14 −7 lines

.github/workflows/build_75.yaml

Lines changed: 1 addition & 0 deletions
@@ -130,6 +130,7 @@
             CUDA_COMPUTE_CAP=75
             GIT_SHA=${{ env.GITHUB_SHA }}
             DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}
+            DEFAULT_USE_FLASH_ATTENTION=False
           tags: ${{ steps.meta-75.outputs.tags }}
           labels: ${{ steps.meta-75.outputs.labels }}
           cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-75,mode=max
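The net effect of this one-line change is that the CI build for compute capability 7.5 (Turing) now bakes `DEFAULT_USE_FLASH_ATTENTION=False` into the image at build time. A rough local equivalent, as a sketch only (the image tag is a placeholder; CI-specific build args such as GIT_SHA, DOCKER_LABEL, and the registry cache settings are omitted for brevity):

```shell
# Sketch: mirror the CI job's new build arg in a local build of the Turing image.
docker build -f Dockerfile-cuda \
  --build-arg CUDA_COMPUTE_CAP=75 \
  --build-arg DEFAULT_USE_FLASH_ATTENTION=False \
  -t text-embeddings-inference:75-local \
  .
```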

Dockerfile-cuda

Lines changed: 6 additions & 3 deletions
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 AS base-builder
+FROM nvidia/cuda:12.0.0-devel-ubuntu22.04 AS base-builder
 
 ENV SCCACHE=0.5.4
 ENV RUSTC_WRAPPER=/usr/local/bin/sccache
@@ -77,10 +77,13 @@ RUN if [ ${CUDA_COMPUTE_CAP} -ge 75 -a ${CUDA_COMPUTE_CAP} -lt 80 ]; \
     cargo build --release --bin text-embeddings-router -F candle-cuda -F static-linking --no-default-features && sccache -s; \
     fi;
 
-FROM nvidia/cuda:12.2.0-base-ubuntu22.04
+FROM nvidia/cuda:12.0.0-base-ubuntu22.04
+
+ARG DEFAULT_USE_FLASH_ATTENTION=True
 
 ENV HUGGINGFACE_HUB_CACHE=/data \
-    PORT=80
+    PORT=80 \
+    USE_FLASH_ATTENTION=$DEFAULT_USE_FLASH_ATTENTION
 
 COPY --from=builder /usr/src/target/release/text-embeddings-router /usr/local/bin/text-embeddings-router
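Because the new `ARG` only seeds `ENV USE_FLASH_ATTENTION`, the value chosen at build time ends up as an ordinary environment variable in the final image, so it can still be flipped when the container is started. A minimal sketch, assuming the Turing image built by the workflow above (the image tag and model id are placeholders):

```shell
# Sketch: re-enable Flash Attention v1 on a Turing image at run time.
docker run --gpus all -p 8080:80 \
  -e USE_FLASH_ATTENTION=True \
  ghcr.io/huggingface/text-embeddings-inference:turing-0.3.0 \
  --model-id BAAI/bge-base-en-v1.5
```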

README.md

Lines changed: 5 additions & 2 deletions
@@ -97,7 +97,7 @@ curl 127.0.0.1:8080/embed \
 ```
 
 **Note:** To use GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html).
-We also recommend using NVIDIA drivers with CUDA version 12.2 or higher.
+We also recommend using NVIDIA drivers with CUDA version 12.0 or higher.
 
 To see all options to serve your models:
 
@@ -236,6 +236,9 @@ Text Embeddings Inference ships with multiple Docker images that you can use to
 | Ada Lovelace (RTX 4000 series, ...) | ghcr.io/huggingface/text-embeddings-inference:89-0.3.0 |
 | Hopper (H100) | ghcr.io/huggingface/text-embeddings-inference:hopper-0.3.0 (experimental) |
 
+**Warning**: Flash Attention is turned off by default for the Turing image as it suffers from precision issues.
+You can turn Flash Attention v1 ON by using the `USE_FLASH_ATTENTION=True` environment variable.
+
 ### API documentation
 
 You can consult the OpenAPI documentation of the `text-embeddings-inference` REST API using the `/docs` route.
@@ -307,7 +310,7 @@ sudo apt-get install libssl-dev gcc -y
 
 GPUs with Cuda compute capabilities < 7.5 are not supported (V100, Titan V, GTX 1000 series, ...).
 
-Make sure you have Cuda and the nvidia drivers installed. We recommend using NVIDIA drivers with CUDA version 12.2 or higher.
+Make sure you have Cuda and the nvidia drivers installed. We recommend using NVIDIA drivers with CUDA version 12.0 or higher.
 You also need to add the nvidia binaries to your path:
 
 ```shell

backends/candle/src/lib.rs

Lines changed: 2 additions & 2 deletions
@@ -132,9 +132,9 @@ impl CandleBackend {
         if cfg!(any(feature = "flash-attn", feature = "flash-attn-v1"))
             && dtype == DType::F16
             && config.position_embedding_type == PositionEmbeddingType::Absolute
-            // Flash attention v1 precision problem with head_size == 32
+            // Allow disabling because of flash attention v1 precision problems
             // See: https://github.com/huggingface/text-embeddings-inference/issues/37
-            && !(*RUNTIME_COMPUTE_CAP == 75 && (config.hidden_size / config.num_attention_heads) == 32)
+            && &std::env::var("USE_FLASH_ATTENTION").unwrap_or("True".to_string()).to_lowercase() == "true"
         {
             tracing::info!("Starting FlashBert model on Cuda");
             Box::new(FlashBertModel::load(vb, &config, pool).s()?)
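With this hunk the router keys Flash Attention solely off `USE_FLASH_ATTENTION`: the value defaults to `True` when the variable is unset and is lowercased before comparison, so the flag is effectively case-insensitive. The old hard-coded exclusion for compute capability 75 with `head_size == 32` is gone, which is why the Turing CI image above sets the default to `False` instead. Since the variable is read straight from the process environment, the same switch also works outside Docker when running a locally built router; a sketch with a placeholder model id, assuming a build with the flash-attn features enabled:

```shell
# Sketch: disable Flash Attention for a locally built router binary.
# Any of "false"/"False"/"FALSE" works because the value is lowercased first.
USE_FLASH_ATTENTION=False text-embeddings-router \
  --model-id BAAI/bge-base-en-v1.5 \
  --port 8080
```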
