# nor does it submit to any jurisdiction.


+from __future__ import annotations
+
import logging
+import math
from typing import Optional

import einops
+import torch
+from packaging import version
from torch import Tensor
from torch import nn
from torch.distributed.distributed_c10d import ProcessGroup

-try:
-    from flash_attn import flash_attn_func as attn_func
-except ImportError:
-    from torch.nn.functional import scaled_dot_product_attention as attn_func
-
-    _FLASH_ATTENTION_AVAILABLE = False
-else:
-    _FLASH_ATTENTION_AVAILABLE = True
-
from anemoi.models.distributed.transformer import shard_heads
from anemoi.models.distributed.transformer import shard_sequence
from anemoi.utils.config import DotDict


class MultiHeadSelfAttention(nn.Module):
36- """Multi Head Self Attention Pytorch Layer."""
32+ """Multi Head Self Attention Pytorch Layer
33+
34+ allows for three different attention implementations:
35+ - scaled dot product attention, see https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
36+ - flash attention, see https://github.com/Dao-AILab/flash-attention
37+ """

    def __init__(
        self,
@@ -44,32 +45,89 @@ def __init__(
        is_causal: bool = False,
        window_size: Optional[int] = None,
        dropout_p: float = 0.0,
+        attention_implementation: str = "flash_attention",
+        softcap: Optional[float] = None,
+        use_alibi_slopes: bool = False,
    ):
52+ """Initialize MultiHeadSelfAttention.
53+
54+ For the flash attention implementation, two additional parameters are available: softcap, use_alibi_slopes
55+
56+ softcap: Softcapping prevents the logits from growing excessively large
57+
58+ use_alibi_slopes: Adds bias of `(-alibi_slope * |i + seqlen_k - seqlen_q - j|)` to the attention score of
59+ query i and key j, where alibi_slope is calculated using get_alibi_slopes
60+
61+ Parameters
62+ ----------
63+ num_heads : int
64+ number of heads
65+ embed_dim : int
66+ embedding dimension
67+ bias : bool, optional
68+ bias, by default False
69+ is_causal : bool, optional
70+ apply causal attention mask, by default False
71+ window_size : Optional[int], optional
72+ window_size, by default None
73+ dropout_p : float, optional
74+ dropout probability, by default 0.0
75+ attention_implementation: str, optional
76+ A predefined string which selects which underlying attention
77+ implementation, by default "flash_attention"
78+ softcap : float, optional
79+ Anything > 0 activates softcapping attention, by default None
80+ use_alibi_slopes : bool, optional
81+ Adds bias
82+ """
        super().__init__()

        assert (
            embed_dim % num_heads == 0
        ), f"Embedding dimension ({embed_dim}) must be divisible by number of heads ({num_heads})"

+        self.attention_implementation = attention_implementation
+        self.use_alibi_slopes = use_alibi_slopes
+
        self.num_heads = num_heads
        self.embed_dim = embed_dim
        self.head_dim = embed_dim // num_heads  # q k v
-        self.window_size = (window_size, window_size)  # flash attention
+        self.window_size = window_size
        self.dropout_p = dropout_p
        self.is_causal = is_causal
+        self.softcap = softcap
+
+        self.set_attention_function()
+
+        if self.use_alibi_slopes:
+            self.alibi_slopes = get_alibi_slopes(num_heads)
+            assert self.alibi_slopes.shape[0] == num_heads, "Error: Number of alibi_slopes must match number of heads"
+        else:
+            self.alibi_slopes = None

        linear = layer_kernels["Linear"]
        self.lin_qkv = linear(embed_dim, 3 * embed_dim, bias=bias)
-        self.attention = attn_func
-
-        if not _FLASH_ATTENTION_AVAILABLE:
-            LOGGER.warning("Flash attention not available, falling back to pytorch scaled_dot_product_attention")

        self.projection = linear(embed_dim, embed_dim, bias=True)

+    def set_attention_function(self):
+        attn_funcs = {
+            "flash_attention": FlashAttentionWrapper,
+            "scaled_dot_product_attention": SDPAAttentionWrapper,
+        }
+        assert (
+            self.attention_implementation in attn_funcs
+        ), f"{self.attention_implementation} not supported. \
+        Please change model.processor.attention_implementation to one of: {attn_funcs.keys()}"
+        LOGGER.info(f"Using {self.attention_implementation}")
+
+        # initialise the attention function here
+        self.attention = attn_funcs[self.attention_implementation]()
+
    def forward(
        self, x: Tensor, shapes: list, batch_size: int, model_comm_group: Optional[ProcessGroup] = None
    ) -> Tensor:
+
        query, key, value = self.lin_qkv(x).chunk(3, -1)

        if model_comm_group:
@@ -92,24 +150,151 @@ def forward(
        value = shard_heads(value, shapes=shapes, mgroup=model_comm_group)
        dropout_p = self.dropout_p if self.training else 0.0

-        if _FLASH_ATTENTION_AVAILABLE:
-            query, key, value = (
-                einops.rearrange(t, "batch heads grid vars -> batch grid heads vars") for t in (query, key, value)
+        out = self.attention(
+            query,
+            key,
+            value,
+            batch_size,
+            causal=False,
+            window_size=self.window_size,
+            dropout_p=dropout_p,
+            softcap=self.softcap,
+            alibi_slopes=self.alibi_slopes,
+        )
+
+        out = shard_sequence(out, shapes=shapes, mgroup=model_comm_group)
+        out = einops.rearrange(out, "batch heads grid vars -> (batch grid) (heads vars)")
+
+        out = self.projection(out)
+
+        return out
+
+
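# Illustrative sketch (not part of the commit above): constructing the layer and
# choosing the attention backend via `attention_implementation`. The leading
# constructor parameters are hidden by the hunk header, so the keyword names used
# below (num_heads, embed_dim, layer_kernels) are assumptions taken from the
# docstring and from the `layer_kernels["Linear"]` lookup in __init__.
def _demo_build_layer() -> "MultiHeadSelfAttention":  # hypothetical helper
    return MultiHeadSelfAttention(
        num_heads=16,
        embed_dim=1024,
        layer_kernels={"Linear": nn.Linear},  # assumed shape of the layer_kernels mapping
        attention_implementation="scaled_dot_product_attention",  # avoids the flash-attn dependency
        window_size=512,
        dropout_p=0.0,
    )
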
+class SDPAAttentionWrapper(nn.Module):
+    """Wrapper for PyTorch scaled dot product attention."""
+
+    def __init__(self):
+        super().__init__()
+
+        from torch.nn.functional import scaled_dot_product_attention
+
+        self.attention = scaled_dot_product_attention
+        self.mask = None
+        self.window_size = None
+
+    def update_mask(self, seq_len, window_size: int, device: str):
+
+        self.mask = (
+            torch.abs(
+                torch.arange(seq_len, device=device).unsqueeze(0) - torch.arange(seq_len, device=device).unsqueeze(1)
            )
-            out = self.attention(query, key, value, causal=False, window_size=self.window_size, dropout_p=dropout_p)
-            out = einops.rearrange(out, "batch grid heads vars -> batch heads grid vars")
-        else:
+            <= window_size
+        )
+
+    def forward(
+        self,
+        query,
+        key,
+        value,
+        batch_size: int,
+        causal=False,
+        window_size=None,
+        dropout_p=0.0,
+        softcap=None,
+        alibi_slopes=None,
+    ):
+        if softcap is not None:
+            raise NotImplementedError(
+                "Softcap not supported by PyTorch's SDPA. Please switch to flash attention or disable softcap."
+            )
+        if alibi_slopes is not None:
+            raise NotImplementedError(
+                "Alibi slopes not supported by PyTorch's SDPA. Please switch to flash attention or disable alibi slopes."
+            )
+
+        sequence_len = query.shape[-2]
+
+        if window_size is not None and (self.mask is None or tuple(self.mask.shape) != (sequence_len, sequence_len)):
+            self.update_mask(sequence_len, window_size=window_size, device=query.device)
+
+        with torch.nn.attention.sdpa_kernel(backends=[torch.nn.attention.SDPBackend.MATH]):
            out = self.attention(
                query,
                key,
                value,
-                is_causal=False,
+                attn_mask=self.mask,
+                is_causal=causal,
                dropout_p=dropout_p,
-            )  # expects (batch heads grid variable) format
+            )

-        out = shard_sequence(out, shapes=shapes, mgroup=model_comm_group)
-        out = einops.rearrange(out, "batch heads grid vars -> (batch grid) (heads vars)")
+        return out

-        out = self.projection(out)

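# Illustrative sketch (not part of the commit above): the banded Boolean mask built
# by update_mask keeps, for every query position i, only the key positions j with
# |i - j| <= window_size. `_demo_sliding_window_mask` is a hypothetical helper that
# reproduces the construction on CPU for a small sequence.
def _demo_sliding_window_mask(seq_len: int = 5, window_size: int = 1) -> Tensor:
    positions = torch.arange(seq_len)
    # broadcasting (1, seq_len) against (seq_len, 1) yields the |i - j| distance matrix
    mask = torch.abs(positions.unsqueeze(0) - positions.unsqueeze(1)) <= window_size
    # for seq_len=5, window_size=1 this is the tridiagonal band:
    # [[ True,  True, False, False, False],
    #  [ True,  True,  True, False, False],
    #  [False,  True,  True,  True, False],
    #  [False, False,  True,  True,  True],
    #  [False, False, False,  True,  True]]
    return mask
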
+class FlashAttentionWrapper(nn.Module):
+    """Wrapper for flash attention."""
+
+    def __init__(self):
+        super().__init__()
+        try:
+            import flash_attn
+        except ImportError:
+            raise ImportError("Error: Flash-attn not installed. Please install flash-attn to use Flash Attention")
+
+        if version.parse(flash_attn.__version__) < version.parse("2.6.0"):
+            raise RuntimeError("Error: Flash-attn version is too low. Update to 2.6.0 or higher.")
+        else:
+            self.attention = flash_attn.flash_attn_func
+
+    def forward(
+        self,
+        query,
+        key,
+        value,
+        batch_size: int,
+        causal: bool = False,
+        window_size: Optional[int] = None,
+        dropout_p: float = 0.0,
+        softcap: Optional[float] = None,
+        alibi_slopes: Optional[torch.Tensor] = None,
+    ):
+        query, key, value = (
+            einops.rearrange(t, "batch heads grid vars -> batch grid heads vars") for t in (query, key, value)
+        )
+
+        alibi_slopes = alibi_slopes.repeat(batch_size, 1).to(query.device) if alibi_slopes is not None else None
+
+        out = self.attention(
+            query,
+            key,
+            value,
+            causal=causal,
+            window_size=(window_size, window_size),
+            dropout_p=dropout_p,
+            softcap=softcap,
+            alibi_slopes=alibi_slopes,
+        )
+        out = einops.rearrange(out, "batch grid heads vars -> batch heads grid vars")
        return out
+
+
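# Illustrative sketch (not part of the commit above): flash_attn_func consumes
# tensors in (batch, seqlen, nheads, headdim) layout, which is why the wrapper
# transposes from the module's "batch heads grid vars" layout and back again.
# `_demo_flash_layout` is a hypothetical helper that only illustrates the shapes.
def _demo_flash_layout(batch: int = 2, heads: int = 4, grid: int = 16, head_dim: int = 32) -> tuple:
    q = torch.randn(batch, heads, grid, head_dim)
    q_flash = einops.rearrange(q, "batch heads grid vars -> batch grid heads vars")
    return tuple(q.shape), tuple(q_flash.shape)  # (2, 4, 16, 32) -> (2, 16, 4, 32)
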
+def get_alibi_slopes(num_heads: int) -> Tensor:
+    """Calculates the ALiBi (Attention with Linear Biases) slope for each attention head.
+
+    Parameters
+    ----------
+    num_heads : int
+        number of attention heads
+
+    Returns
+    -------
+    Tensor
+        ALiBi slopes
+    """
+    n = 2 ** math.floor(math.log2(num_heads))
+    slope_0 = 2 ** (-8 / n)
+    alibi_slopes = torch.pow(slope_0, torch.arange(1, 1 + n))
+    if n < num_heads:
+        slope_hat_0 = 2 ** (-4 / n)
+        alibi_slopes_hat = torch.pow(slope_hat_0, torch.arange(1, 1 + 2 * (num_heads - n), 2))
+        alibi_slopes = torch.cat([alibi_slopes, alibi_slopes_hat])
+    return alibi_slopes
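
# Illustrative check (not part of the commit above): the slopes form the geometric
# sequence 2^(-8k/n) for k = 1..n, with n the largest power of two <= num_heads;
# when num_heads is not a power of two, the remaining heads take every other value
# of the steeper 2^(-4k/n) sequence.
if __name__ == "__main__":
    print(get_alibi_slopes(8))
    # tensor([0.5000, 0.2500, 0.1250, 0.0625, 0.0312, 0.0156, 0.0078, 0.0039])
    print(get_alibi_slopes(6))
    # tensor([0.2500, 0.0625, 0.0156, 0.0039, 0.5000, 0.1250])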