Skip to content

Commit d2947c6

Browse files
authored
[210] Handling NaNs and other corner cases in the metrics file (ecmwf#248)
* changes * changes * fixes
1 parent 63c5468 commit d2947c6

File tree

6 files changed

+86
-12
lines changed

6 files changed

+86
-12
lines changed

config/streams/streams_test/era5.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,11 @@ ERA5 :
2121
net : transformer
2222
num_tokens : 1
2323
num_heads : 4
24-
dim_embed : 128
24+
dim_embed : 16
2525
num_blocks : 2
2626
embed_target_coords :
2727
net : linear
28-
dim_embed : 128
28+
dim_embed : 16
2929
target_readout :
3030
type : 'obs_value' # token or obs_value
3131
num_layers : 2

integration_tests/small1.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -88,42 +88,42 @@ def assert_missing_metrics_file(run_id):
8888

8989

9090
def assert_train_loss_below_threshold(run_id):
    """Assert that the most recent train 'stream.ERA5.loss_mse.loss_avg' metric is below a threshold.

    :param run_id: identifier of the run whose metrics file is inspected.
    :raises AssertionError: if the metric is missing or not below the threshold.
    """
    metrics = load_metrics(run_id)
    # Walk the metrics backwards so the most recent "train" entry wins.
    loss_metric = next(
        (
            metric.get("stream.ERA5.loss_mse.loss_avg", None)
            for metric in reversed(metrics)
            if metric.get("stage") == "train"
        ),
        None,
    )
    assert loss_metric is not None, (
        "'stream.ERA5.loss_mse.loss_avg' metric is missing in metrics file"
    )
    # Check that the loss does not explode in a single epoch
    # This is meant to be a quick test, not a convergence test
    # Bug fix: the message previously claimed "below 0.25" while the assert
    # checked 1.25; name the threshold once so they cannot diverge again.
    threshold = 1.25
    assert loss_metric < threshold, (
        f"'stream.ERA5.loss_mse.loss_avg' is {loss_metric}, expected to be below {threshold}"
    )
109109

110110

111111
def assert_val_loss_below_threshold(run_id):
    """Assert that the most recent val 'stream.ERA5.loss_mse.loss_avg' metric is below a threshold.

    :param run_id: identifier of the run whose metrics file is inspected.
    :raises AssertionError: if the metric is missing or not below the threshold.
    """
    metrics = load_metrics(run_id)
    # Walk the metrics backwards so the most recent "val" entry wins.
    loss_metric = next(
        (
            metric.get("stream.ERA5.loss_mse.loss_avg", None)
            for metric in reversed(metrics)
            if metric.get("stage") == "val"
        ),
        None,
    )
    assert loss_metric is not None, (
        "'stream.ERA5.loss_mse.loss_avg' metric is missing in metrics file"
    )
    # Check that the loss does not explode in a single epoch
    # This is meant to be a quick test, not a convergence test
    # Bug fix: the message previously claimed "below 0.25" while the assert
    # checked 1.25; name the threshold once so they cannot diverge again.
    threshold = 1.25
    assert loss_metric < threshold, (
        f"'stream.ERA5.loss_mse.loss_avg' is {loss_metric}, expected to be below {threshold}"
    )

integration_tests/small1.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,5 +10,10 @@ lr_steps: 4
1010
lr_steps_warmup: 2
1111
lr_steps_cooldown: 2
1212
loader_num_workers: 1
13+
14+
forecast_offset : 0
15+
# len_hrs: 6
16+
# step_hrs: 6
17+
1318
train_log:
1419
log_interval: 1

src/weathergen/utils/metrics.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
"""
2+
Utilities related to reading and writing metrics.
3+
4+
We use our own simple json-based format to abstract away various backends (our own pipeline, mlflow, wandb, etc.).
5+
"""
6+
7+
import polars as pl
8+
9+
# Known columns that are not scalar metrics:
10+
_known_cols = {"weathergen.timestamp": pl.Int64, "weathergen.time": pl.Int64, "stage": pl.String}
11+
12+
13+
def read_metrics_file(f: str) -> pl.DataFrame:
14+
"""
15+
Loads a file of metrics.
16+
17+
The resulting dataframe has the following format:
18+
- all columns in known_cols (if they exist in the file) have the right type
19+
- all other columns are of type float64 (including NaN values)
20+
"""
21+
22+
# All values are scalar, except for known values
23+
# The following point needs to be taken into account:
24+
# 1. The schema is not known in advance
25+
# 2. NaN is encoded as string
26+
# 3. numbers are encoded as numbers
27+
# The file needs to be read 3 times:
28+
# 1. Get the name of all the columns
29+
# 2. Find all the NaN values
30+
# 3. Read the numbers
31+
# 4. Merge the two dataframes
32+
33+
# Find the list of all columns (read everything)
34+
df0 = pl.read_ndjson(f, infer_schema_length=None)
35+
# Read with the final schema:
36+
schema1 = dict([(n, _known_cols.get(n, pl.Float64)) for n in df0.columns])
37+
df1 = pl.read_ndjson(f, schema=schema1)
38+
# Read again as strings to find the NaN values:
39+
schema2 = dict([(n, _known_cols.get(n, pl.String)) for n in df0.columns])
40+
metrics_cols = [n for n in df0.columns if n not in _known_cols]
41+
df2 = pl.read_ndjson(f, schema=schema2).cast(dict([(n, pl.Float64) for n in metrics_cols]))
42+
43+
# Merge the two dataframes:
44+
for n in metrics_cols:
45+
df1 = df1.with_columns(
46+
pl.when(pl.col(n).is_not_nan()).then(df1[n]).otherwise(df2[n]).alias(n)
47+
)
48+
return df1
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
from io import StringIO
from math import isnan

from weathergen.utils.metrics import (
    read_metrics_file,
)

s = """{"weathergen.timestamp":100, "m": "nan"}
{"weathergen.timestamp":101,"m": 1.3}
{"weathergen.timestamp":102,"a": 4}
"""


def test1():
    # Parse the three-line ndjson fixture above and check types and holes.
    frame = read_metrics_file(StringIO(s))
    assert frame.shape == (3, 3)

    # Known column keeps its integer values.
    timestamps = frame["weathergen.timestamp"].to_list()
    assert timestamps == [100, 101, 102]

    # "nan" strings become float NaN; missing keys become nulls.
    m_values = frame["m"].to_list()
    assert isnan(m_values[0])
    assert m_values[1:] == [1.3, None]
    assert frame["a"].to_list() == [None, None, 4]

src/weathergen/utils/train_logger.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import polars as pl
2121

2222
import weathergen.utils.config as config
23+
from weathergen.utils.metrics import read_metrics_file
2324

2425
_weathergen_timestamp = "weathergen.timestamp"
2526
_weathergen_reltime = "weathergen.reltime"
@@ -257,7 +258,7 @@ def read_metrics(
257258
run_id = cf.run_id
258259

259260
# TODO: this should be a config option
260-
df = pl.read_ndjson(f"./results/{run_id}/metrics.json")
261+
df = read_metrics_file(f"./results/{run_id}/metrics.json")
261262
if stage is not None:
262263
df = df.filter(pl.col("stage") == stage)
263264
df = df.drop("stage")
@@ -294,7 +295,7 @@ def clean_df(df, columns: list[str] | None):
294295

295296
def _clean_name(n: str) -> str:
296297
"""Cleans the stream name to only retain alphanumeric characters"""
297-
return "".join([c for c in n if c.isalnum()]).lower()
298+
return "".join([c for c in n if c.isalnum()])
298299

299300

300301
def _key_loss(st_name: str, lf_name: str) -> str:

0 commit comments

Comments
 (0)