MeteoSwiss
diff --git a/‎models/README.md‎
Lines changed: 1 addition & 1 deletion b/‎models/README.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎models/docs/introduction/overview.rst‎
Lines changed: 2 additions & 2 deletions b/‎models/docs/introduction/overview.rst‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎models/src/anemoi/models/distributed/shapes.py‎
Lines changed: 7 additions & 0 deletions b/‎models/src/anemoi/models/distributed/shapes.py‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎models/src/anemoi/models/layers/bounding.py‎
Lines changed: 54 additions & 0 deletions b/‎models/src/anemoi/models/layers/bounding.py‎
Lines changed: 54 additions & 0 deletions
diff --git a/‎models/src/anemoi/models/layers/mapper.py‎
Lines changed: 54 additions & 6 deletions b/‎models/src/anemoi/models/layers/mapper.py‎
Lines changed: 54 additions & 6 deletions
diff --git a/‎models/src/anemoi/models/layers/truncation.py‎
Lines changed: 185 additions & 0 deletions b/‎models/src/anemoi/models/layers/truncation.py‎
Lines changed: 185 additions & 0 deletions
@@ -7,7 +7,7 @@ This project is **BETA** and will be **Experimental** for the foreseeable future
 Interfaces and functionality are likely to change, and the project itself may be scrapped.
 **DO NOT** use this software in any project/software that is operational.
 
-Miscellanous tools for training data-driven weather forecasts.
+Miscellaneous tools for training data-driven weather forecasting models.
 
 ## Documentation
 
 
@@ -87,8 +87,8 @@ to process the input data.
 The layers are designed as extensible classes to allow for easy
 experimentation and switching out of components.
 
-Mappers
-=======
+Graph Mappers
+=============
 
 The layers implement `Mappers`, which maps data between the input grid
 and the internal hidden grid. The `Mappers` are used as encoder and
 
@@ -38,3 +38,10 @@ def apply_shard_shapes(tensor: Tensor, dim: int, shard_shapes_dim: list) -> list
         shard_shapes[i][dim] = shard_shape
 
     return shard_shapes
+
+
+def get_or_apply_shard_shapes(
+    x: Tensor,
+    dim: int = 0,
+    shard_shapes_dim: Optional[list] = None,
+    model_comm_group: Optional[ProcessGroup] = None,
+):
+    """Return the shard shapes of ``x``, derived or built from given sizes.
+
+    Parameters
+    ----------
+    x : Tensor
+        Tensor whose shard shapes are requested.
+    dim : int, optional
+        Dimension whose size differs between shards, by default 0.
+    shard_shapes_dim : list, optional
+        Per-shard sizes of dimension ``dim``. When given, they are handed to
+        ``apply_shard_shapes``; when ``None``, the shapes are obtained from
+        ``get_shard_shapes`` instead.
+    model_comm_group : ProcessGroup, optional
+        Model communication group. Only forwarded to ``get_shard_shapes``,
+        i.e. it is ignored when ``shard_shapes_dim`` is provided.
+
+    Returns
+    -------
+    list
+        Whatever the delegated helper returns (``apply_shard_shapes`` is
+        annotated as returning a ``list``).
+    """
+    if shard_shapes_dim is None:
+        return get_shard_shapes(x, dim, model_comm_group)
+    return apply_shard_shapes(x, dim, shard_shapes_dim)
@@ -11,9 +11,12 @@
 
 from abc import ABC
 from abc import abstractmethod
+from typing import Any
+from typing import Iterable
 from typing import Optional
 
 import torch
+from hydra.utils import instantiate
 from torch import nn
 
 from anemoi.models.data_indices.tensor import InputTensorIndex
@@ -301,3 +304,54 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         # Calculate the fraction of the total variable
         x[..., self.data_index] *= x[..., self.total_variable]
         return x
+
+
+def build_boundings(
+    model_config: Any,
+    data_indices: Any,
+    statistics: dict | None,
+) -> nn.ModuleList:
+    """Create the output-bounding modules described by the model configuration.
+
+    Every entry of ``model_config.model.bounding`` is passed to Hydra's
+    ``instantiate`` together with three shared keyword arguments
+    (``name_to_index``, ``statistics`` and ``name_to_index_stats``). The
+    resulting modules are collected, in configuration order, in an
+    ``nn.ModuleList``.
+
+    Parameters
+    ----------
+    model_config : Any
+        Configuration object. Its ``model.bounding`` attribute, when present
+        and non-empty, is iterated to obtain one config per bounding module.
+        A missing ``model`` or ``bounding`` attribute, or a falsy ``bounding``
+        value, yields an empty ``nn.ModuleList``.
+    data_indices : Any
+        Provider of ``data_indices.model.output.name_to_index`` (forwarded as
+        ``name_to_index``) and ``data_indices.data.input.name_to_index``
+        (forwarded as ``name_to_index_stats``). Only accessed when at least
+        one bounding is configured.
+    statistics : dict | None
+        Statistics forwarded unchanged to every bounding module.
+
+    Returns
+    -------
+    torch.nn.ModuleList
+        The instantiated bounding modules; possibly empty.
+    """
+    model_section = getattr(model_config, "model", object())
+    configured: Iterable[Any] = getattr(model_section, "bounding", []) or []
+
+    boundings = nn.ModuleList()
+    for bounding_cfg in configured:
+        # The index maps are looked up per entry on purpose: with nothing
+        # configured, ``data_indices`` is never touched.
+        boundings.append(
+            instantiate(
+                bounding_cfg,
+                name_to_index=data_indices.model.output.name_to_index,
+                statistics=statistics,
+                name_to_index_stats=data_indices.data.input.name_to_index,
+            )
+        )
+    return boundings
@@ -400,7 +400,7 @@ def run_processor_chunk_edge_sharding(
 
         return self.post_process(x_dst_out, shapes[1], model_comm_group, keep_x_dst_sharded=True)
 
-    def forward_with_edge_sharding(
+    def mapper_forward_with_edge_sharding(
         self,
         x: PairTensor,
         batch_size: int,
@@ -453,7 +453,7 @@ def forward_with_edge_sharding(
 
         return out_dst
 
-    def forward_with_heads_sharding(
+    def mapper_forward_with_heads_sharding(
         self,
         x: PairTensor,
         batch_size: int,
@@ -513,9 +513,9 @@ def forward(
         }
 
         if self.shard_strategy == "edges":
-            return self.forward_with_edge_sharding(**kwargs_forward)
+            return self.mapper_forward_with_edge_sharding(**kwargs_forward)
         else:  # self.shard_strategy == "heads"
-            return self.forward_with_heads_sharding(**kwargs_forward)
+            return checkpoint(self.mapper_forward_with_heads_sharding, **kwargs_forward, use_reentrant=False)
 
 
 class GraphTransformerForwardMapper(ForwardMapperPreProcessMixin, GraphTransformerBaseMapper):
@@ -818,7 +818,7 @@ def prepare_edges(
         edge_attr = self.emb_edges(edge_attr)
         return edge_attr, edge_index
 
-    def forward(
+    def mapper_forward(
         self,
         x: PairTensor,
         batch_size: int,
@@ -852,6 +852,30 @@ def forward(
 
         return x_src, x_dst
 
+    def forward(
+        self,
+        x: PairTensor,
+        batch_size: int,
+        shard_shapes: tuple[tuple[int], tuple[int]],
+        model_comm_group: Optional[ProcessGroup] = None,
+        x_src_is_sharded: bool = False,
+        x_dst_is_sharded: bool = False,
+        keep_x_dst_sharded: bool = False,
+        **kwargs,
+    ) -> PairTensor:
+        """Run ``mapper_forward`` through ``checkpoint``.
+
+        Every argument is forwarded unchanged, by keyword, to
+        ``self.mapper_forward``; the call is wrapped in ``checkpoint`` with
+        ``use_reentrant=False`` (presumably ``torch.utils.checkpoint.checkpoint``
+        -- confirm against the module imports).
+
+        Parameters
+        ----------
+        x : PairTensor
+            Input pair handed to ``mapper_forward``.
+        batch_size : int
+            Batch size handed to ``mapper_forward``.
+        shard_shapes : tuple[tuple[int], tuple[int]]
+            Shard shapes handed to ``mapper_forward``.
+        model_comm_group : ProcessGroup, optional
+            Model communication group, by default None.
+        x_src_is_sharded : bool, optional
+            Forwarded flag, by default False.
+        x_dst_is_sharded : bool, optional
+            Forwarded flag, by default False.
+        keep_x_dst_sharded : bool, optional
+            Forwarded flag, by default False.
+        **kwargs
+            Extra keyword arguments forwarded to ``mapper_forward``.
+
+        Returns
+        -------
+        PairTensor
+            Whatever ``mapper_forward`` returns.
+        """
+        named_arguments = {
+            "x": x,
+            "batch_size": batch_size,
+            "shard_shapes": shard_shapes,
+            "model_comm_group": model_comm_group,
+            "x_src_is_sharded": x_src_is_sharded,
+            "x_dst_is_sharded": x_dst_is_sharded,
+            "keep_x_dst_sharded": keep_x_dst_sharded,
+        }
+        return checkpoint(self.mapper_forward, **named_arguments, **kwargs, use_reentrant=False)
+
 
 class GNNForwardMapper(ForwardMapperPreProcessMixin, GNNBaseMapper):
     """Graph Neural Network Mapper data -> hidden."""
@@ -1155,7 +1179,7 @@ def __init__(
 
         self.emb_nodes_dst = nn.Linear(self.in_channels_dst, self.hidden_dim)
 
-    def forward(
+    def mapper_forward(
         self,
         x: PairTensor,
         batch_size: int,
@@ -1181,6 +1205,30 @@ def forward(
 
         return x_dst
 
+    def forward(
+        self,
+        x: PairTensor,
+        batch_size: int,
+        shard_shapes: tuple[tuple[int], tuple[int]],
+        model_comm_group: Optional[ProcessGroup] = None,
+        x_src_is_sharded: bool = False,
+        x_dst_is_sharded: bool = False,
+        keep_x_dst_sharded: bool = False,
+        **kwargs,
+    ) -> PairTensor:
+        """Delegate to ``mapper_forward`` wrapped in ``checkpoint``.
+
+        The named parameters and any extra ``kwargs`` are passed on by keyword,
+        unmodified, and ``checkpoint`` is invoked with ``use_reentrant=False``
+        (presumably ``torch.utils.checkpoint.checkpoint`` -- confirm against
+        the module imports).
+
+        Parameters
+        ----------
+        x : PairTensor
+            Input pair handed to ``mapper_forward``.
+        batch_size : int
+            Batch size handed to ``mapper_forward``.
+        shard_shapes : tuple[tuple[int], tuple[int]]
+            Shard shapes handed to ``mapper_forward``.
+        model_comm_group : ProcessGroup, optional
+            Model communication group, by default None.
+        x_src_is_sharded : bool, optional
+            Forwarded flag, by default False.
+        x_dst_is_sharded : bool, optional
+            Forwarded flag, by default False.
+        keep_x_dst_sharded : bool, optional
+            Forwarded flag, by default False.
+        **kwargs
+            Extra keyword arguments forwarded to ``mapper_forward``.
+
+        Returns
+        -------
+        PairTensor
+            The value produced by ``mapper_forward``.
+        """
+        checkpointed_call_kwargs = dict(
+            x=x,
+            batch_size=batch_size,
+            shard_shapes=shard_shapes,
+            model_comm_group=model_comm_group,
+            x_src_is_sharded=x_src_is_sharded,
+            x_dst_is_sharded=x_dst_is_sharded,
+            keep_x_dst_sharded=keep_x_dst_sharded,
+        )
+        result = checkpoint(
+            self.mapper_forward,
+            **checkpointed_call_kwargs,
+            **kwargs,
+            use_reentrant=False,
+        )
+        return result
+
 
 class TransformerForwardMapper(ForwardMapperPreProcessMixin, TransformerBaseMapper):
     """Transformer Mapper from data -> hidden."""
 
@@ -0,0 +1,185 @@
+# (C) Copyright 2025 Anemoi contributors.
+#
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+
+import logging
+
+import numpy as np
+import torch
+
+from anemoi.models.distributed.graph import gather_channels
+from anemoi.models.distributed.graph import shard_channels
+from anemoi.models.distributed.shapes import get_or_apply_shard_shapes
+
+LOGGER = logging.getLogger(__name__)
+
+
+class BaseTruncation:
+    """Apply resolution truncation/upsampling via sparse projection matrices.
+
+    This utility holds two (optional) sparse COO matrices:
+
+    - ``A_down``: projects from a high-resolution representation to a coarse one
+      (e.g., spectral/graph truncation).
+    - ``A_up``: projects from the coarse representation back to high resolution
+      (e.g., zero-padding or learned up-projection).
+
+    Both matrices are given as SciPy sparse matrices at construction time and
+    are converted to PyTorch sparse COO tensors. During ``__call__`` the
+    matrices are moved to the input device and applied per sample in the
+    batch. When inputs are grid-sharded across ranks, tensors are resharded to
+    channel-sharding so the projection sees the full sequence, and are then
+    restored to their original sharding scheme.
+
+    Notes
+    -----
+    - Sparse tensors are **not** registered as buffers because DDP does not
+      reliably broadcast sparse tensors; instead the matrices are lazily moved
+      to the correct device on use.
+    - Matrix-tensor multiplication is performed as ``A @ X`` (left
+      multiplication), where ``A`` is sparse (``[n_out, n_in]``) and ``X`` is
+      dense (``[n_in, d]``), producing ``[n_out, d]``.
+    """
+
+    def __init__(self, truncation_data: dict) -> None:
+        """Build the truncation matrices.
+
+        Parameters
+        ----------
+        truncation_data : dict
+            Dictionary possibly containing keys ``"down"`` and/or ``"up"`` with
+            SciPy sparse matrices. ``"down"`` defines the high->coarse projection
+            (stored as ``A_down``); ``"up"`` defines the coarse->high projection
+            (stored as ``A_up``). A missing key leaves the matching attribute
+            set to ``None``.
+        """
+        self.A_down, self.A_up = None, None
+        if "down" in truncation_data:
+            self.A_down = self._make_truncation_matrix(truncation_data["down"])
+            LOGGER.info("Truncation: A_down %s", self.A_down.shape)
+        if "up" in truncation_data:
+            self.A_up = self._make_truncation_matrix(truncation_data["up"])
+            LOGGER.info("Truncation: A_up %s", self.A_up.shape)
+
+    def _make_truncation_matrix(self, A, data_type=torch.float32):
+        """Convert a SciPy sparse matrix to a coalesced PyTorch COO tensor.
+
+        Parameters
+        ----------
+        A : scipy.sparse.spmatrix
+            Input sparse matrix with shape ``(n_out, n_in)``.
+        data_type : torch.dtype, optional
+            Target dtype for the tensor values, by default ``torch.float32``.
+
+        Returns
+        -------
+        torch.Tensor
+            A coalesced sparse COO tensor with the same shape as ``A``.
+        """
+        # Read indices and values from one COO view so they are always aligned.
+        # ``A.nonzero()`` skips explicitly stored zeros whereas ``A.data`` keeps
+        # them, so pairing those two can yield mismatched lengths.
+        A_coo = A.tocoo()
+        A_ = torch.sparse_coo_tensor(
+            torch.tensor(np.vstack((A_coo.row, A_coo.col)), dtype=torch.long),
+            torch.tensor(A_coo.data, dtype=data_type),
+            size=A.shape,
+        ).coalesce()
+        return A_
+
+    def _multiply_sparse(self, x, A):
+        """Left-multiply a dense matrix by a sparse projection.
+
+        Parameters
+        ----------
+        x : torch.Tensor
+            Dense 2-D tensor with shape ``(n_in, d)``.
+        A : torch.Tensor
+            Sparse COO tensor with shape ``(n_out, n_in)``.
+
+        Returns
+        -------
+        torch.Tensor
+            Dense 2-D tensor with shape ``(n_out, d)`` equal to ``A @ x``.
+        """
+        return torch.sparse.mm(A, x)
+
+    def _truncate_fields(self, x, A, batch_size=None, auto_cast=False):
+        """Apply a sparse projection to each item in a batch.
+
+        Parameters
+        ----------
+        x : torch.Tensor
+            Dense 3-D tensor with shape ``(B, n_in, d)``. For each batch item
+            ``i``, ``x[i]`` is multiplied as ``A @ x[i]``.
+        A : torch.Tensor
+            Sparse COO tensor with shape ``(n_out, n_in)``.
+        batch_size : int, optional
+            Number of batch elements to process. If ``None`` (default) or
+            ``0``, ``x.shape[0]`` is used.
+        auto_cast : bool, optional
+            If ``True``, enables CUDA autocast for the multiplication loop.
+
+        Returns
+        -------
+        torch.Tensor
+            Dense 3-D tensor with shape ``(batch_size, n_out, d)`` containing
+            the projected batch.
+        """
+        if not batch_size:
+            batch_size = x.shape[0]
+        with torch.amp.autocast(device_type="cuda", enabled=auto_cast):
+            out = [self._multiply_sparse(x[i, ...], A) for i in range(batch_size)]
+        return torch.stack(out)
+
+    def __call__(self, x, grid_shard_shapes=None, model_comm_group=None):
+        """Apply down/up truncation to a (possibly sharded) batch.
+
+        This function optionally:
+        1) Reshards grid-sharded inputs to a channel-sharded layout to expose
+           the full sequence to the projection matrices.
+        2) Applies ``A_down`` (high->coarse) and/or ``A_up`` (coarse->high) per
+           batch element when provided.
+        3) Restores the original sharding layout.
+
+        Parameters
+        ----------
+        x : torch.Tensor
+            Input dense tensor of shape ``(B, n_in, d)``; when grid-sharded,
+            ``n_in`` is the local part of the grid dimension.
+        grid_shard_shapes : Any, optional
+            Per-shard sizes of the grid dimension, used to convert between
+            grid and channel sharding. If ``None``, no resharding is performed.
+        model_comm_group : Any, optional
+            Communication group handle used by distributed helpers.
+
+        Returns
+        -------
+        torch.Tensor
+            Output tensor with the same global shape semantics as ``x``. If
+            truncation matrices are present, the ``n_in`` dimension is replaced
+            by the corresponding ``n_out`` after projection. Without any
+            matrix, ``x`` is returned untouched.
+        """
+        if self.A_down is not None or self.A_up is not None:
+            if grid_shard_shapes is not None:
+                # The grid dimension of ``(B, n_in, d)`` is -2; dim 0 is the batch
+                # and must keep its size in every shard shape.
+                shard_shapes = get_or_apply_shard_shapes(x, -2, grid_shard_shapes, model_comm_group)
+                # grid-sharded input: reshard to channel-shards to apply truncation
+                x = shard_channels(x, shard_shapes, model_comm_group)  # we get the full sequence here
+
+            # these can't be registered as buffers because ddp does not like to broadcast sparse tensors
+            # hence we check that they are on the correct device; copy should only happen in the first forward run
+            if self.A_down is not None:
+                self.A_down = self.A_down.to(x.device)
+                x = self._truncate_fields(x, self.A_down)  # to coarse resolution
+            if self.A_up is not None:
+                self.A_up = self.A_up.to(x.device)
+                x = self._truncate_fields(x, self.A_up)  # back to high resolution
+
+            if grid_shard_shapes is not None:
+                # back to grid-sharding as before
+                x = gather_channels(x, shard_shapes, model_comm_group)
+
+        return x