Commit 8584b6d

add layers
1 parent 37d2931 commit 8584b6d

File tree

6 files changed: +214, -0 lines changed


backends/python/server/text_embeddings_server/layers/__init__.py

Whitespace-only changes.
Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
from text_embeddings_server.utils.import_utils import SYSTEM
import os

if os.getenv("USE_FLASH_ATTENTION", "").lower() == "false":
    raise ImportError("`USE_FLASH_ATTENTION` is false.")
if SYSTEM == "cuda":
    from .cuda import attention
elif SYSTEM == "rocm":
    from .rocm import attention
else:
    raise ImportError(f"System {SYSTEM} doesn't support flash/paged attention")
Lines changed: 92 additions & 0 deletions
@@ -0,0 +1,92 @@
import os
import torch

from loguru import logger

if os.getenv("USE_FLASH_ATTENTION", "").lower() == "false":
    raise ImportError("`USE_FLASH_ATTENTION` is false.")

if not torch.cuda.is_available():
    raise ImportError("CUDA is not available")

major, minor = torch.cuda.get_device_capability()
is_sm75 = major == 7 and minor == 5
is_sm8x = major == 8 and minor >= 0
is_sm90 = major == 9 and minor == 0

HAS_FLASH_ATTN = False
HAS_FLASH_ATTN_V2 = False
try:
    try:
        import flash_attn_2_cuda
    except ImportError:
        raise ImportError(
            "Flash Attention V2 is not installed.\n"
            "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) "
            "or install flash attention v2 with `cd server && make install install-flash-attention-v2`"
        )
    if not (is_sm8x or is_sm90):
        raise ImportError(
            f"GPU with CUDA capability {major} {minor} is not supported for "
            "Flash Attention V2"
        )
    HAS_FLASH_ATTN_V2 = True
except ImportError as e:
    try:
        import flash_attn_cuda
    except ImportError:
        raise ImportError(
            "Flash Attention is not installed.\n"
            "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) "
            "or install flash attention with `cd server && make install install-flash-attention`"
        ) from e

    if not (is_sm75 or is_sm8x or is_sm90):
        raise ImportError(
            f"GPU with CUDA capability {major} {minor} is not supported"
        ) from e
    logger.warning(f"Unable to use Flash Attention V2: {e}")
    HAS_FLASH_ATTN = True


def attention(q, k, v, out, cu_seqlens, max_s, softmax_scale, is_causal=False):
    if HAS_FLASH_ATTN_V2:
        return flash_attn_2_cuda.varlen_fwd(
            q,
            k,
            v,
            out,
            cu_seqlens,
            cu_seqlens,
            max_s,
            max_s,
            0.0,
            softmax_scale,
            False,
            is_causal,
            -1,
            -1,
            False,
            None,
        )

    if HAS_FLASH_ATTN:
        return flash_attn_cuda.fwd(
            q,
            k,
            v,
            out,
            cu_seqlens,
            cu_seqlens,
            max_s,
            max_s,
            0.0,
            softmax_scale,
            False,
            is_causal,
            False,
            0,
            None,
        )

    raise NotImplementedError("flash attention is not installed")
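
For context, a minimal usage sketch (not part of this commit) of the varlen attention entry point above; the module path, shapes, and sequence lengths are illustrative assumptions:

# Hypothetical sketch: run varlen flash attention over two packed sequences of
# lengths 3 and 5, assuming a CUDA GPU with flash attention installed.
import torch

from text_embeddings_server.layers.attention import attention  # assumed module path

num_heads, head_dim = 12, 64
seq_lens = [3, 5]
total_tokens = sum(seq_lens)

# Packed (total_tokens, num_heads, head_dim) tensors in fp16, as flash attention expects.
q = torch.randn(total_tokens, num_heads, head_dim, dtype=torch.float16, device="cuda")
k = torch.randn_like(q)
v = torch.randn_like(q)
out = torch.empty_like(q)

# Cumulative sequence lengths [0, 3, 8] and the longest sequence length.
cu_seqlens = torch.tensor([0, 3, 8], dtype=torch.int32, device="cuda")
max_s = max(seq_lens)

attention(q, k, v, out, cu_seqlens, max_s, softmax_scale=head_dim**-0.5)
# `out` now holds the attention output, still packed as (total_tokens, num_heads, head_dim).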
Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
import os
import torch
from text_embeddings_server.utils.import_utils import SYSTEM
from loguru import logger

major, minor = torch.cuda.get_device_capability()
is_sm75 = major == 7 and minor == 5

if SYSTEM == "rocm":
    try:
        import flash_attn_2_cuda

        logger.info("ROCm: using Flash Attention 2 Composable Kernel implementation.")
    except ImportError as e:
        if major >= 8 or is_sm75:
            architecture_suffix = f"-{SYSTEM}"
            raise ImportError(f"Flash Attention V2 is not installed. {e}")
        else:
            for idx in range(torch.cuda.device_count()):
                name = torch.cuda.get_device_name(idx)
                if "MI210" not in name and "MI250" not in name and "MI300" not in name:
                    raise ImportError(
                        f"AMD GPU {torch.cuda.get_device_name(idx)} does not support flash-attention"
                    )
            raise ImportError(
                f"AMD GPU with ROCm capability {major} {minor} is not supported"
            ) from e


def attention(q, k, v, out, cu_seqlens, max_s, softmax_scale, is_causal=False):
    return flash_attn_2_cuda.varlen_fwd(
        q,
        k,
        v,
        out,
        cu_seqlens,
        cu_seqlens,
        max_s,
        max_s,
        0.0,
        softmax_scale,
        False,
        is_causal,
        False,
        None,
    )
Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
import torch
from text_embeddings_server.utils.import_utils import SYSTEM

from transformers.models.bert import BertConfig

if SYSTEM == "cuda":
    import dropout_layer_norm

    class FastLayerNorm:
        def __init__(self, prefix, handle, device, dtype, config: BertConfig):
            self.weight = handle.get_tensor(f"{prefix}.weight").to(dtype).to(device)
            self.bias = handle.get_tensor(f"{prefix}.bias").to(dtype).to(device)
            self.variance_epsilon = config.layer_norm_eps

        def forward(self, hidden_states, residual=None):
            normed_hidden_states, residual, *rest = dropout_layer_norm.dropout_add_ln_fwd(
                hidden_states,
                residual,
                self.weight,
                self.bias,
                None,
                None,
                None,
                None,
                0.0,
                self.variance_epsilon,
                1.0,
                0,
                None,
                False,
                False,
            )
            if residual is None:
                residual = hidden_states

            return normed_hidden_states, residual

elif SYSTEM == "rocm":

    class FastLayerNorm:
        def __init__(self, prefix, handle, device, dtype, config: BertConfig):
            self.weight = handle.get_tensor(f"{prefix}.weight").to(dtype).to(device)
            self.bias = handle.get_tensor(f"{prefix}.bias").to(dtype).to(device)
            self.variance_epsilon = config.layer_norm_eps

        def forward(self, hidden_states, residual=None):
            if residual is not None:
                hidden_states += residual
            residual = hidden_states

            hidden_states = torch.nn.functional.layer_norm(
                hidden_states,
                self.weight.shape,
                self.weight,
                self.bias,
                eps=self.variance_epsilon,
            )

            return hidden_states, residual

else:
    raise ValueError("System not recognized")
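
For context, a minimal usage sketch (not part of this commit) of FastLayerNorm; the checkpoint path, tensor prefix, and model name are illustrative assumptions:

# Hypothetical sketch: load one BERT LayerNorm from a safetensors checkpoint and
# apply it with a residual, assuming a CUDA or ROCm system.
import torch
from safetensors import safe_open
from transformers.models.bert import BertConfig

from text_embeddings_server.layers.layernorm import FastLayerNorm  # assumed module path

config = BertConfig.from_pretrained("bert-base-uncased")  # assumed model
with safe_open("model.safetensors", framework="pt") as handle:  # assumed checkpoint path
    ln = FastLayerNorm(
        "encoder.layer.0.attention.output.LayerNorm",  # assumed tensor prefix
        handle,
        device="cuda",
        dtype=torch.float16,
        config=config,
    )

hidden_states = torch.randn(8, config.hidden_size, dtype=torch.float16, device="cuda")
residual = torch.randn_like(hidden_states)

# Both backends return (normed_hidden_states, residual) so the caller can reuse the residual.
normed, new_residual = ln.forward(hidden_states, residual)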
Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
import torch
from loguru import logger

SYSTEM = None
if torch.version.hip is not None:
    SYSTEM = "rocm"
elif torch.version.cuda is not None and torch.cuda.is_available():
    SYSTEM = "cuda"
else:
    SYSTEM = "cpu"

logger.info(f"Python backend: detected system {SYSTEM}")
