MeteoSwiss
diff --git a/‎src/weathergen/datasets/anemoi_dataset.py‎
Lines changed: 205 additions & 31 deletions b/‎src/weathergen/datasets/anemoi_dataset.py‎
Lines changed: 205 additions & 31 deletions
diff --git a/‎src/weathergen/datasets/batchifyer.py‎
Lines changed: 13 additions & 7 deletions b/‎src/weathergen/datasets/batchifyer.py‎
Lines changed: 13 additions & 7 deletions
@@ -11,6 +11,7 @@
 import logging
 
 import numpy as np
+import torch
 from anemoi.datasets import open_dataset
 
 _logger = logging.getLogger(__name__)
@@ -28,6 +29,29 @@ def __init__(
         filename: str,
         stream_info: dict,
     ) -> None:
+        """
+        Construct dataset based on anemoi dataset
+
+        Parameters
+        ----------
+        start : int
+            Start time
+        end : int
+            End time
+        len_hrs : int
+            length of data window
+        step_hrs : int
+            delta hours between start times of windows
+        filename : str
+            filename (and path) of dataset
+        stream_info : dict
+            information about stream
+
+        Returns
+        -------
+        None
+        """
+
         # TODO: add support for different normalization modes
 
         assert len_hrs == step_hrs, "Currently only step_hrs=len_hrs is supported"
@@ -106,31 +130,69 @@ def __init__(
         else:
             self.ds = open_dataset(ds, frequency=str(step_hrs) + "h", start=dt_start, end=dt_end)
 
-    def __len__(self):
-        "Length of dataset"
+    def __len__(self) -> int:
+        """
+        Length of dataset
+
+        Parameters
+        ----------
+        None
 
+        Returns
+        -------
+        length of dataset
+        """
         if not self.ds:
             return 0
 
         return len(self.ds)
 
     def get_source(self, idx: int) -> tuple[np.array, np.array, np.array, np.array]:
         """
-        TODO
+        Get source data for idx
+
+        Parameters
+        ----------
+        idx : int
+            Index of temporal window
+
+        Returns
+        -------
+        source data (coords, geoinfos, data, datetimes)
         """
         return self._get(idx, self.source_idx)
 
     def get_target(self, idx: int) -> tuple[np.array, np.array, np.array, np.array]:
         """
-        TODO
+        Get target data for idx
+
+        Parameters
+        ----------
+        idx : int
+            Index of temporal window
+
+        Returns
+        -------
+        target data (coords, geoinfos, data, datetimes)
         """
         return self._get(idx, self.target_idx)
 
     def _get(
         self, idx: int, channels_idx: np.array
     ) -> tuple[np.array, np.array, np.array, np.array]:
         """
-        TODO
+        Get data for window
+
+        Parameters
+        ----------
+        idx : int
+            Index of temporal window
+        channels_idx : np.array
+            Selection of channels
+
+        Returns
+        -------
+        data (coords, geoinfos, data, datetimes)
         """
 
         if not self.ds:
@@ -172,74 +234,186 @@ def _get(
 
         return (latlon, geoinfos, data, datetimes)
 
-    def get_source_size(self):
-        """
-        TODO
+    def get_source_num_channels(self) -> int:
         """
-        return 2 + len(self.geoinfo_idx) + len(self.source_idx)
+        Get number of source channels
 
-    def get_source_num_channels(self):
-        """
-        TODO
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        number of source channels
         """
         return len(self.source_idx)
 
-    def get_target_size(self):
+    def get_target_num_channels(self) -> int:
         """
-        TODO
+        Get number of target channels
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        number of target channels
         """
-        return 2 + len(self.geoinfo_idx) + len(self.target_idx)
+        return len(self.target_idx)
 
-    def get_target_num_channels(self):
+    def get_coords_size(self) -> int:
         """
-        TODO
+        Get size of coords
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        size of coords
         """
-        return len(self.target_idx)
+        return 2
 
-    def get_geoinfo_size(self):
+    def get_geoinfo_size(self) -> int:
         """
-        TODO
+        Get size of geoinfos
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        size of geoinfos
         """
         return len(self.geoinfo_idx)
 
-    def normalize_coords(self, coords):
+    def normalize_coords(self, coords: torch.tensor) -> torch.tensor:
         """
-        TODO
+        Normalize coordinates
+
+        Parameters
+        ----------
+        coords :
+            coordinates to be normalized
+
+        Returns
+        -------
+        Normalized coordinates
         """
         coords[..., 0] = np.sin(np.deg2rad(coords[..., 0]))
         coords[..., 1] = np.sin(0.5 * np.deg2rad(coords[..., 1]))
 
         return coords
 
-    def normalize_geoinfos(self, geoinfos):
+    def normalize_geoinfos(self, geoinfos: torch.tensor) -> torch.tensor:
         """
-        TODO
+        Normalize geoinfos
+
+        Parameters
+        ----------
+        geoinfos :
+            geoinfos to be normalized
+
+        Returns
+        -------
+        Normalized geoinfos
         """
 
-        assert geoinfos.shape[-1] == 0
+        assert geoinfos.shape[-1] == 0, "incorrect number of geoinfo channels"
         return geoinfos
 
-    def normalize_source_channels(self, source):
+    def normalize_source_channels(self, source: torch.tensor) -> torch.tensor:
         """
-        TODO
+        Normalize source channels
+
+        Parameters
+        ----------
+        source :
+            source data to be normalized
+
+        Returns
+        -------
+        Normalized data
         """
-        assert source.shape[1] == len(self.source_idx)
+        assert source.shape[-1] == len(self.source_idx), "incorrect number of channels"
         for i, ch in enumerate(self.source_idx):
             source[..., i] = (source[..., i] - self.mean[ch]) / self.stdev[ch]
 
         return source
 
-    def normalize_target_channels(self, target):
+    def normalize_target_channels(self, target: torch.tensor) -> torch.tensor:
         """
-        TODO
+        Normalize target channels
+
+        Parameters
+        ----------
+        target :
+            target data to be normalized
+
+        Returns
+        -------
+        Normalized data
         """
-        assert target.shape[1] == len(self.target_idx)
+        assert target.shape[-1] == len(self.target_idx), "incorrect number of channels"
         for i, ch in enumerate(self.target_idx):
             target[..., i] = (target[..., i] - self.mean[ch]) / self.stdev[ch]
 
         return target
 
+    def denormalize_source_channels(self, source: torch.tensor) -> torch.tensor:
+        """
+        Denormalize source channels
+
+        Parameters
+        ----------
+        source :
+            source data to be denormalized
+
+        Returns
+        -------
+        Denormalized data
+        """
+        assert source.shape[-1] == len(self.source_idx), "incorrect number of channels"
+        for i, ch in enumerate(self.source_idx):
+            source[..., i] = (source[..., i] * self.stdev[ch]) + self.mean[ch]
+
+        return source
+
+    def denormalize_target_channels(self, data: torch.tensor) -> torch.tensor:
+        """
+        Denormalize target channels
+
+        Parameters
+        ----------
+        data :
+            data to be denormalized (target or pred)
+
+        Returns
+        -------
+        Denormalized data
+        """
+        assert data.shape[-1] == len(self.target_idx), "incorrect number of channels"
+        for i, ch in enumerate(self.target_idx):
+            data[..., i] = (data[..., i] * self.stdev[ch]) + self.mean[ch]
+
+        return data
+
     def time_window(self, idx: int) -> tuple[np.datetime64, np.datetime64]:
+        """
+        Temporal window corresponding to index
+
+        Parameters
+        ----------
+        idx : int
+            Index of temporal window
+
+        Returns
+        -------
+        start and end of temporal window
+        """
         if not self.ds:
             return (np.array([], dtype=np.datetime64), np.array([], dtype=np.datetime64))
 
 
@@ -406,19 +406,25 @@ def batchify_target(
             )
             hpy_idxs_ord_split = np.split(hpy_idxs_ord, splits + 1)
 
-            times = encode_times_target(times, time_win)
+            times_enc = encode_times_target(times, time_win)
 
             target_tokens = [torch.tensor([]) for _ in range(self.num_healpix_cells_target)]
+            target_coords_raw = [torch.tensor([]) for _ in range(self.num_healpix_cells_target)]
             target_coords = [torch.tensor([]) for _ in range(self.num_healpix_cells_target)]
             target_geoinfos = [torch.tensor([]) for _ in range(self.num_healpix_cells_target)]
+            target_times_raw = [torch.tensor([]) for _ in range(self.num_healpix_cells_target)]
             target_times = [torch.tensor([]) for _ in range(self.num_healpix_cells_target)]
             for i, c in enumerate(cells_idxs):
                 t = normalizer.normalize_target_channels(source[hpy_idxs_ord_split[i]])
-                t = t[self.rng.permutation(len(t))][: int(len(t) * sampling_rate_target)]
-                target_tokens[c] = t
-                target_coords[c] = coords[hpy_idxs_ord_split[i]]
-                target_geoinfos[c] = normalizer.normalize_geoinfos(geoinfos[hpy_idxs_ord_split[i]])
-                target_times[c] = times[hpy_idxs_ord_split[i]]
+                perm = self.rng.permutation(len(t))[: int(len(t) * sampling_rate_target)]
+                target_tokens[c] = t[perm]
+                target_coords[c] = coords[hpy_idxs_ord_split[i]][perm]
+                target_coords_raw[c] = coords[hpy_idxs_ord_split[i]][perm]
+                target_geoinfos[c] = normalizer.normalize_geoinfos(
+                    geoinfos[hpy_idxs_ord_split[i]][perm]
+                )
+                target_times_raw[c] = times[hpy_idxs_ord_split[i]][perm]
+                target_times[c] = times_enc[hpy_idxs_ord_split[i]][perm]
 
             target_tokens_lens = torch.tensor([len(s) for s in target_tokens], dtype=torch.int32)
 
@@ -436,4 +442,4 @@ def batchify_target(
                 target_coords.requires_grad = False
                 target_coords = list(target_coords.split(target_tokens_lens.tolist()))
 
-        return (target_tokens, target_coords)
+        return (target_tokens, target_coords, target_coords_raw, target_times_raw)