fix: associate output stream names with correct index (ecmwf#519)

grassesi · web-flow · commit 511f03641145 · 2025-07-28T16:05:08.000+02:00
* fix: associate output stream names with correct index

* apply ruff formatting

* fix: iteration over output items

* address comments

* fix: correctly index channels

* fix stream indexing logic, add asserts

* fix: extraction of data/coordinates for sources

* fix assert
diff --git a/packages/common/src/weathergen/common/io.py b/packages/common/src/weathergen/common/io.py
@@ -240,7 +240,8 @@ def forecast_steps(self) -> list[int]:
 class OutputBatchData:
     """Provide convenient access to adapt existing output data structures."""
 
-    # sample, stream, tensor(datapoint, channel) => datapoints is accross all datasets per stream
+    # sample, stream, tensor(datapoint, channel+coords)
+    # => datapoints is across all datasets per stream
     sources: list[list]
 
     # fstep, stream, redundant dim (size 1), tensor(sample x datapoint, channel)
@@ -258,7 +259,8 @@ class OutputBatchData:
     # fstep, stream, redundant dim (size 1)
     targets_lens: list[list[list[int]]]
 
-    stream_names: list[str]
+    # stream name: index into data (only streams in analysis_streams_output)
+    streams: dict[str, int]
 
     # stream, channel name
     channels: list[list[str]]
@@ -279,22 +281,23 @@ def forecast_steps(self):
 
     def items(self) -> typing.Generator[OutputItem, None, None]:
         """Iterate over possible output items"""
-        filtered_streams = (stream for stream in self.stream_names if stream != "")
         # TODO: filter for empty items?
-        for s, fo_s, fi_s in itertools.product(self.samples, self.forecast_steps, filtered_streams):
+        for s, fo_s, fi_s in itertools.product(
+            self.samples, self.forecast_steps, self.streams.keys()
+        ):
             yield self.extract(ItemKey(int(s), int(fo_s), fi_s))
 
     def extract(self, key: ItemKey) -> OutputItem:
         """Extract datasets from lists for one output item."""
         # adjust shifted values in ItemMeta
         sample = key.sample - self.sample_start
         forecast_step = key.forecast_step - self.forecast_offset
-        stream_idx = self.stream_names.index(key.stream)  # TODO: assure this is correct
+        stream_idx = self.streams[key.stream]
         lens = self.targets_lens[forecast_step][stream_idx]
         start = sum(lens[:sample])
         n_samples = lens[sample]
 
-        _logger.info("extracting subset")
+        _logger.info(f"extracting subset: {key}")
         _logger.info(
             f"sample: start:{self.sample_start} rel_idx:{sample} range:{start}-{start + n_samples}"
         )
@@ -331,18 +334,40 @@ def extract(self, key: ItemKey) -> OutputItem:
         channels = self.channels[stream_idx]
         geoinfo_channels = self.geoinfo_channels[stream_idx]
 
+        assert len(channels) == target_data.shape[1], (
+            "Number of channel names does not align with data"
+        )
+        assert len(channels) == preds_data.shape[1], (
+            "Number of channel names does not align with data"
+        )
+
         if key.with_source:
             source_data = self.sources[sample][stream_idx].cpu().detach().numpy()
+
+            # split data into coords, geoinfo, channels
+            _source_coords = source_data[:, : -len(channels)]
+            source_coords = _source_coords[:, :2]
+            source_times = _source_coords[:, 2]
+            source_geoinfo = _source_coords[:, 2 : -len(channels)]
+
+            # TODO asserts that times, coords, geoinfos should match?
+
             source_dataset = OutputDataset(
                 "source",
                 key,
-                source_data,
-                times,
-                coords,
-                geoinfo,
+                source_data[:, -len(channels) :],
+                source_times,
+                source_coords,
+                source_geoinfo,
                 channels,
                 geoinfo_channels,
             )
+
+            _logger.info(f"source shape: {source_dataset.data.shape}")
+            assert len(channels) == source_dataset.data.shape[1], (
+                "Number of channel names does not align with data"
+            )
+            assert len(geoinfo_channels) == source_dataset.geoinfo.shape[1]
         else:
             source_dataset = None
 
diff --git a/src/weathergen/utils/validation_io.py b/src/weathergen/utils/validation_io.py
@@ -26,36 +26,35 @@ def write_output(
     targets_times_all,
     targets_lens,
 ):
-    if cf.analysis_streams_output is None:
-        output_stream_names = [stream.name for stream in cf.streams]
-        _logger.info(f"Using all streams as output streams: {output_stream_names}")
-    else:
-        output_stream_names = [
-            stream.name for stream in cf.streams if stream.name in cf.analysis_streams_output
-        ]
-    _logger.info(f"Using output streams: {output_stream_names}")
-    # TODO: streams anemoi `source`, `target` commented out???
+    stream_names = [stream.name for stream in cf.streams]
+    output_stream_names = cf.analysis_streams_output
+    if output_stream_names is None:
+        output_stream_names = stream_names
 
-    channels: list[list[str]] = [
-        list(stream.val_target_channels)
-        for stream in cf.streams
-        if stream.name in output_stream_names
-    ]
+    output_streams = {name: stream_names.index(name) for name in output_stream_names}
+
+    _logger.info(f"Using output streams: {output_streams} from streams: {stream_names}")
+
+    channels: list[list[str]] = [list(stream.val_target_channels) for stream in cf.streams]
 
     geoinfo_channels = [[] for _ in cf.streams]  # TODO obtain channels
 
     # assume: is batch size guarnteed and constant:
     # => calculate global sample indices for this batch by offsetting by sample_start
     sample_start = batch_idx * cf.batch_size_validation_per_gpu
 
+    assert len(stream_names) == len(targets_all[0]), "data does not match number of streams"
+    assert len(stream_names) == len(preds_all[0]), "data does not match number of streams"
+    assert len(stream_names) == len(sources[0]), "data does not match number of streams"
+
     data = io.OutputBatchData(
         sources,
         targets_all,
         preds_all,
         targets_coords_all,
         targets_times_all,
         targets_lens,
-        output_stream_names,
+        output_streams,
         channels,
         geoinfo_channels,
         sample_start,