
Commit 5e9c3ef

tjhunter and Jubeku authored
[1092] Adds pushing metrics to the evaluation pipeline (ecmwf#1127)
* changes
* changes
* changes
* changes
* changes
* scores successfully pushed to MLFlow, still need to refactor
* try to batch upload all metrics from same runid
* batch logging all scores of each run_id
* get parent_run by from_run_id
* changes
* cleanups
* bug fixes
* typing issue
* Cleanup
* pdb
* integration test

---------

Co-authored-by: Jubeku <julian.kuehnert@ecmwf.int>
1 parent b4fc1a2 commit 5e9c3ef

File tree

12 files changed, +701 -10 lines changed

integration_tests/small1_test.py

Lines changed: 2 additions & 1 deletion
@@ -134,7 +134,8 @@ def evaluate_results(run_id):
             },
         }
     )
-    evaluate_from_config(cfg)
+    # Not passing the mlflow client for tests.
+    evaluate_from_config(cfg, None)


 def load_metrics(run_id):

packages/common/src/weathergen/common/io.py

Lines changed: 0 additions & 1 deletion
@@ -83,7 +83,6 @@ def combine(cls, others: list["IOReaderData"]) -> "IOReaderData":

         others is list of ReaderData instances.
         """
-
         assert len(others) > 0, len(others)

         other = others[0]
packages/common/src/weathergen/common/platform_env.py

Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
+"""
+Platform environment configuration for WeatherGenerator.
+
+These are loaded from secrets in the private repository.
+"""
+
+import importlib
+import importlib.util
+from functools import lru_cache
+from typing import Protocol
+
+from weathergen.common.config import _REPO_ROOT
+
+
+class PlatformEnv(Protocol):
+    """
+    Interface for platform environment configuration.
+    """
+
+    def get_hpc(self) -> str | None: ...
+
+    def get_hpc_user(self) -> str | None: ...
+
+    def get_hpc_config(self) -> str | None: ...
+
+    def get_hpc_certificate(self) -> str | None: ...
+
+
+@lru_cache(maxsize=1)
+def get_platform_env() -> PlatformEnv:
+    """
+    Loads the platform environment module from the private repository.
+    """
+    env_script_path = _REPO_ROOT.parent / "WeatherGenerator-private" / "hpc" / "platform-env.py"
+    spec = importlib.util.spec_from_file_location("platform_env", env_script_path)
+    platform_env = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(platform_env)  # type: ignore
+    return platform_env  # type: ignore
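
Note that `get_platform_env` returns the loaded module object itself, so plain module-level functions satisfy the `PlatformEnv` protocol structurally (hence the `# type: ignore` markers). A minimal sketch of what the private `platform-env.py` could look like; the environment variable names and bodies below are illustrative assumptions, not the actual private file:

# Hypothetical sketch of WeatherGenerator-private/hpc/platform-env.py.
# The real file ships with the private repository; everything here is a placeholder.
import os


def get_hpc() -> str | None:
    # Name of the HPC system this environment targets.
    return os.environ.get("WG_HPC")


def get_hpc_user() -> str | None:
    return os.environ.get("WG_HPC_USER")


def get_hpc_config() -> str | None:
    # Path to a private OmegaConf YAML holding MLflow settings (used by --push-metrics below).
    return os.environ.get("WG_HPC_CONFIG")


def get_hpc_certificate() -> str | None:
    return os.environ.get("WG_HPC_CERT")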

packages/evaluate/pyproject.toml

Lines changed: 2 additions & 1 deletion
@@ -10,8 +10,9 @@ dependencies = [
     "xhistogram",
     "panel",
     "omegaconf",
-    "weathergen-common",
     "plotly>=6.2.0",
+    "weathergen-common",
+    "weathergen-metrics",
 ]

 [dependency-groups]

packages/evaluate/src/weathergen/evaluate/plot_utils.py

Lines changed: 2 additions & 2 deletions
@@ -30,7 +30,7 @@ def collect_streams(runs: dict):
     return sorted({s for run in runs.values() for s in run["streams"].keys()})


-def collect_channels(scores_dict: dict, metric: str, region: str, runs) -> dict:
+def collect_channels(scores_dict: dict, metric: str, region: str, runs) -> list[str]:
     """Get all unique channels available for given metric and region across runs.

     Parameters
@@ -56,7 +56,7 @@ def collect_channels(scores_dict: dict, metric: str, region: str, runs) -> dict:
         if run_id not in run_data:
             continue
         values = run_data[run_id]["channel"].values
-        channels.update(np.atleast_1d(values))
+        channels.update([str(x) for x in np.atleast_1d(values)])
     return list(channels)

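The `str(x)` cast matters because values pulled from a DataArray coordinate are numpy scalars such as `numpy.str_` rather than the builtin type that the new `list[str]` annotation promises. A small illustration (the channel names are made up):

import numpy as np

# Values as they typically come out of an xarray "channel" coordinate.
values = np.atleast_1d(np.array(["2t", "10u"]))
print(type(values[0]))       # <class 'numpy.str_'>: a numpy scalar type (a str subclass)
print(type(str(values[0])))  # <class 'str'>: the normalized builtin string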

packages/evaluate/src/weathergen/evaluate/run_evaluation.py

Lines changed: 69 additions & 4 deletions
@@ -3,6 +3,7 @@
 # dependencies = [
 #     "weathergen-evaluate",
 #     "weathergen-common",
+#     "weathergen-metrics",
 # ]
 # [tool.uv.sources]
 # weathergen-evaluate = { path = "../../../../../packages/evaluate" }
@@ -14,36 +15,57 @@
 from collections import defaultdict
 from pathlib import Path

+import mlflow
+from mlflow.client import MlflowClient
 from omegaconf import OmegaConf
+from xarray import DataArray

 from weathergen.common.config import _REPO_ROOT
+from weathergen.common.platform_env import get_platform_env
 from weathergen.evaluate.io_reader import WeatherGenReader
+from weathergen.evaluate.plot_utils import collect_channels
 from weathergen.evaluate.utils import (
     calc_scores_per_stream,
     metric_list_to_json,
     plot_data,
     plot_summary,
     retrieve_metric_from_json,
 )
+from weathergen.metrics.mlflow_utils import (
+    MlFlowUpload,
+    get_or_create_mlflow_parent_run,
+    log_scores,
+    setup_mlflow,
+)

 _logger = logging.getLogger(__name__)

 _DEFAULT_PLOT_DIR = _REPO_ROOT / "plots"

+_platform_env = get_platform_env()
+

 def evaluate() -> None:
     # By default, arguments from the command line are read.
     evaluate_from_args(sys.argv[1:])


 def evaluate_from_args(argl: list[str]) -> None:
+    # configure logging
+    logging.basicConfig(level=logging.INFO)
     parser = argparse.ArgumentParser(description="Fast evaluation of WeatherGenerator runs.")
     parser.add_argument(
         "--config",
         type=str,
         default=None,
         help="Path to the configuration yaml file for plotting. e.g. config/plottig_config.yaml",
     )
+    parser.add_argument(
+        "--push-metrics",
+        required=False,
+        action="store_true",
+        help="(optional) Upload scores to MLFlow.",
+    )

     args = parser.parse_args(argl)
     if args.config:
@@ -53,13 +75,19 @@ def evaluate_from_args(argl: list[str]) -> None:
             "No config file provided, using the default template config (please edit accordingly)"
         )
         config = Path(_REPO_ROOT / "config" / "evaluate" / "eval_config.yml")
-    evaluate_from_config(OmegaConf.load(config))
+    mlflow_client: MlflowClient | None = None
+    if args.push_metrics:
+        hpc_conf = _platform_env.get_hpc_config()
+        assert hpc_conf is not None
+        private_home = Path(hpc_conf)
+        private_cf = OmegaConf.load(private_home)
+        mlflow_client = setup_mlflow(private_cf)
+        _logger.info(f"MLFlow client set up: {mlflow_client}")

+    evaluate_from_config(OmegaConf.load(config), mlflow_client)

-def evaluate_from_config(cfg):
-    # configure logging
-    logging.basicConfig(level=logging.INFO)

+def evaluate_from_config(cfg, mlflow_client: MlflowClient | None) -> None:
     # load configuration

     runs = cfg.run_ids
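
At this point the flag is fully wired: `--push-metrics` resolves the private config via the platform environment, builds an MLflow client, and hands it to `evaluate_from_config`. For reference, the flag can also be exercised programmatically, since `evaluate_from_args` takes the same argument list the CLI receives. A minimal sketch (the config path is illustrative, and pushing assumes the private platform environment is available to resolve MLflow credentials):

from weathergen.evaluate.run_evaluation import evaluate_from_args

# Equivalent to: evaluate --config config/evaluate/eval_config.yml --push-metrics
evaluate_from_args(["--config", "config/evaluate/eval_config.yml", "--push-metrics"])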
@@ -149,6 +177,43 @@ def evaluate_from_config(cfg):
                 {"metric": metric}
             )

+    if mlflow_client:
+        # Reorder scores_dict to push to MLFlow per run_id:
+        # Create a new defaultdict with the target structure: [run_id][metric][region][stream]
+        reordered_dict: dict[str, dict[str, dict[str, dict[str, DataArray]]]] = defaultdict(
+            lambda: defaultdict(lambda: defaultdict(dict))
+        )
+
+        # Iterate through the original dictionary to get all keys and the final value
+        for metric, regions_dict in scores_dict.items():
+            for region, streams_dict in regions_dict.items():
+                for stream, runs_dict in streams_dict.items():
+                    for run_id, final_dict in runs_dict.items():
+                        # Assign the final_dict to the new structure using the reordered keys
+                        reordered_dict[run_id][metric][region][stream] = final_dict
+
+        channels_set = collect_channels(scores_dict, metric, region, runs)
+
+        for run_id, run in runs.items():
+            reader = WeatherGenReader(run, run_id, private_paths)
+            from_run_id = reader.inference_cfg["from_run_id"]
+            parent_run = get_or_create_mlflow_parent_run(mlflow_client, from_run_id)
+            _logger.info(f"MLFlow parent run: {parent_run}")
+            phase = "eval"
+            with mlflow.start_run(run_id=parent_run.info.run_id):
+                with mlflow.start_run(
+                    run_name=f"{phase}_{from_run_id}_{run_id}",
+                    parent_run_id=parent_run.info.run_id,
+                    nested=True,
+                ) as run:
+                    mlflow.set_tags(MlFlowUpload.run_tags(run_id, phase, from_run_id))
+                    log_scores(
+                        reordered_dict[run_id],
+                        mlflow_client,
+                        run.info.run_id,
+                        channels_set,
+                    )
+
     # plot summary
     if scores_dict and cfg.evaluation.get("summary_plots", True):
         _logger.info("Started creating summary plots..")
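
The `weathergen.metrics.mlflow_utils` module is part of this commit but not shown in this view. A rough sketch of the shape its helpers would need in order to support the call sites above, inferred from those calls and using only standard `mlflow` APIs; the experiment name, tag keys, and metric key layout are assumptions rather than the committed implementation:

# Inferred sketch of weathergen/metrics/mlflow_utils.py; not the committed code.
import mlflow
from mlflow.client import MlflowClient


class MlFlowUpload:
    @staticmethod
    def run_tags(run_id: str, phase: str, from_run_id: str) -> dict[str, str]:
        # Tags identifying the evaluation run and the training run it derives from.
        return {"run_id": run_id, "phase": phase, "from_run_id": from_run_id}


def setup_mlflow(private_cf) -> MlflowClient:
    # Assumption: the private config carries an MLflow tracking URI.
    mlflow.set_tracking_uri(private_cf.mlflow.tracking_uri)
    return MlflowClient(tracking_uri=private_cf.mlflow.tracking_uri)


def get_or_create_mlflow_parent_run(client: MlflowClient, from_run_id: str):
    # Assumption: parent runs are keyed by a tag holding the training run id.
    experiment = client.get_experiment_by_name("weathergen")  # placeholder name
    hits = client.search_runs(
        [experiment.experiment_id],
        filter_string=f"tags.from_run_id = '{from_run_id}'",
        max_results=1,
    )
    if hits:
        return hits[0]
    return client.create_run(experiment.experiment_id, tags={"from_run_id": from_run_id})


def log_scores(scores, client: MlflowClient, run_id: str, channels: list[str]) -> None:
    # Assumption: `scores` is the reordered [metric][region][stream] mapping of
    # DataArrays; log one value per (metric, region, stream, channel) combination.
    for metric, regions in scores.items():
        for region, streams in regions.items():
            for stream, da in streams.items():
                for channel in channels:
                    if channel not in da["channel"]:
                        continue
                    value = float(da.sel(channel=channel).mean())
                    client.log_metric(run_id, f"{metric}.{region}.{stream}.{channel}", value)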

packages/metrics/pyproject.toml

Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,102 @@
+[project]
+name = "weathergen-metrics"
+version = "0.1.0"
+description = "The WeatherGenerator Machine Learning Earth System Model"
+readme = "../../README.md"
+requires-python = ">=3.12,<3.13"
+dependencies = [
+    "mlflow-skinny",
+    "weathergen-common",
+]
+
+[dependency-groups]
+dev = [
+    "pytest~=8.3.5",
+    "pytest-mock>=3.14.1",
+    "ruff==0.9.7",
+    "pyrefly==0.36.0",
+]
+
+
+[tool.pyrefly]
+project-includes = ["src/"]
+project-excludes = [
+]
+
+[tool.pyrefly.errors]
+bad-argument-type = false
+unsupported-operation = false
+missing-attribute = false
+no-matching-overload = false
+bad-context-manager = false
+
+# To do:
+bad-assignment = false
+bad-return = false
+index-error = false
+not-iterable = false
+not-callable = false
+
+
+
+
+
+# The linting configuration
+[tool.ruff]
+
+# Wide rows
+line-length = 100
+
+[tool.ruff.lint]
+# All disabled until the code is formatted.
+select = [
+    # pycodestyle
+    "E",
+    # Pyflakes
+    "F",
+    # pyupgrade
+    "UP",
+    # flake8-bugbear
+    "B",
+    # flake8-simplify
+    "SIM",
+    # isort
+    "I",
+    # Banned imports
+    "TID",
+    # Naming conventions
+    "N",
+    # print
+    "T201"
+]
+
+# These rules are sensible and should be enabled at a later stage.
+ignore = [
+    # "B006",
+    "B011",
+    "UP008",
+    "SIM117",
+    "SIM118",
+    "SIM102",
+    "SIM401",
+    # To ignore, not relevant for us
+    "SIM108", # in case additional norm layer supports are added in future
+    "N817", # we use heavy acronyms, e.g., allowing 'import LongModuleName as LMN' (LMN is accepted)
+    "E731", # overly restrictive and less readable code
+    "N812", # prevents us following the convention for importing torch.nn.functional as F
+]
+
+[tool.ruff.lint.flake8-tidy-imports.banned-api]
+"numpy.ndarray".msg = "Do not use 'ndarray' to describe a numpy array type, it is a function. Use numpy.typing.NDArray or numpy.typing.NDArray[np.float32] for example"
+
+[tool.ruff.format]
+# Use Unix `\n` line endings for all files
+line-ending = "lf"
+
+
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/weathergen"]

packages/metrics/src/weathergen/metrics/__init__.py

Whitespace-only changes.
