Commit 1c4a966

jakob-schloer, pre-commit-ci[bot], and anaprietonem authored
feat: Log variable scaling in mlflow (ecmwf#327)
## Description

The variable scaling of the loss function is no longer trivial, since many scalers can be multiplied together. To keep track of this, the final variable scaling is logged to the MLflow (or other loggers') hyperparameters.

![image](https://github.com/user-attachments/assets/1450c624-9038-441c-9923-5c61bd770120)

## Type of Change

- [x] New feature (non-breaking change which adds functionality)

## Code Compatibility

- [x] I have performed a self-review of my code
- [ ] Test with wandb and other loggers

### Code Performance and Testing

- [x] I ran the [complete Pytest test suite](https://anemoi.readthedocs.io/projects/training/en/latest/dev/testing.html) locally, and it passes
- [x] I have tested the changes on a single GPU
- [x] I have tested the changes on multiple GPUs / multi-node setups
- [ ] I have run the Benchmark Profiler against the old version of the code

### Documentation

- [x] My code follows the style guidelines of this project
- [x] I have added comments to my code, particularly in hard-to-understand areas

📚 Documentation previews 📚:

- anemoi-training: https://anemoi-training--327.org.readthedocs.build/en/327/
- anemoi-graphs: https://anemoi-graphs--327.org.readthedocs.build/en/327/
- anemoi-models: https://anemoi-models--327.org.readthedocs.build/en/327/

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Ana Prieto Nemesio <91897203+anaprietonem@users.noreply.github.com>
1 parent 42c0194 commit 1c4a966

File tree

3 files changed: +50 -12 lines changed

- training/src/anemoi/training/diagnostics/logger.py
- training/src/anemoi/training/losses/utils.py
- training/src/anemoi/training/train/tasks/base.py

training/src/anemoi/training/diagnostics/logger.py

Lines changed: 0 additions & 6 deletions
```diff
@@ -17,7 +17,6 @@
 from omegaconf import OmegaConf

 from anemoi.training.schemas.base_schema import BaseSchema
-from anemoi.training.schemas.base_schema import convert_to_omegaconf

 LOGGER = logging.getLogger(__name__)

@@ -91,11 +90,6 @@ def get_mlflow_logger(config: BaseSchema) -> None:
         on_resume_create_child=config.diagnostics.log.mlflow.on_resume_create_child,
         max_params_length=max_params_length,
     )
-    config_params = OmegaConf.to_container(convert_to_omegaconf(config), resolve=True)
-    logger.log_hyperparams(
-        config_params,
-        expand_keys=config.diagnostics.log.mlflow.expand_hyperparams,
-    )

     if config.diagnostics.log.mlflow.terminal:
         logger.log_terminal_output(artifact_save_dir=config.hardware.paths.plots)
```
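With this change, `get_mlflow_logger` no longer logs the hyperparameters when the logger is constructed; that responsibility moves to the Lightning `setup` hook added in `training/src/anemoi/training/train/tasks/base.py` below, where the variable loss scaling can be attached to the logged configuration.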

training/src/anemoi/training/losses/utils.py

Lines changed: 26 additions & 5 deletions
```diff
@@ -8,27 +8,48 @@
 # nor does it submit to any jurisdiction.


+from __future__ import annotations
+
 import logging
+from typing import TYPE_CHECKING

-from anemoi.models.data_indices.collection import IndexCollection
-from anemoi.training.losses.base import BaseLoss
 from anemoi.training.utils.enums import TensorDim

+if TYPE_CHECKING:
+    from anemoi.models.data_indices.collection import IndexCollection
+    from anemoi.training.losses.base import BaseLoss
+
 LOGGER = logging.getLogger(__name__)


-def print_variable_scaling(loss: BaseLoss, data_indices: IndexCollection) -> None:
-    """Log the final variable scaling for each variable in the model.
+def print_variable_scaling(loss: BaseLoss, data_indices: IndexCollection) -> dict[str, float]:
+    """Log the final variable scaling for each variable in the model and return the scaling values.

     Parameters
     ----------
     loss : BaseLoss
         Loss function to get the variable scaling from.
     data_indices : IndexCollection
         Index collection to get the variable names from.
+
+    Returns
+    -------
+    dict[str, float]
+        Dictionary mapping variable names to their scaling values, plus the key 'total_sum'
+        with the sum of all scaling values.
     """
     variable_scaling = loss.scaler.subset_by_dim(TensorDim.VARIABLE.value).get_scaler(len(TensorDim)).reshape(-1)
     log_text = "Final Variable Scaling: "
+    scaling_values, scaling_sum = {}, 0.0
+
     for idx, name in enumerate(data_indices.model.output.name_to_index.keys()):
-        log_text += f"{name}: {variable_scaling[idx]:.4g}, "
+        value = float(variable_scaling[idx])
+        log_text += f"{name}: {value:.4g}, "
+        scaling_values[name] = value
+        scaling_sum += value
+
+    log_text += f"Total scaling sum: {scaling_sum:.4g}, "
+    scaling_values["total_sum"] = scaling_sum
     LOGGER.debug(log_text)
+
+    return scaling_values
```
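As a quick illustration of the new return value, the sketch below reproduces the aggregation with invented variable names and scaling values; in the real function these come from the loss scaler and `data_indices.model.output.name_to_index`.

```python
# Minimal sketch of the dict shape returned by print_variable_scaling().
# The variable names and scaling values below are invented for illustration.
variable_names = ["2t", "10u", "msl"]        # hypothetical output variables
variable_scaling = [0.025, 0.0125, 0.05]     # hypothetical per-variable scalings

scaling_values, scaling_sum = {}, 0.0
for name, value in zip(variable_names, variable_scaling):
    scaling_values[name] = float(value)
    scaling_sum += float(value)
scaling_values["total_sum"] = scaling_sum

print(scaling_values)
# {'2t': 0.025, '10u': 0.0125, 'msl': 0.05, 'total_sum': 0.0875}
```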

training/src/anemoi/training/train/tasks/base.py

Lines changed: 24 additions & 1 deletion
```diff
@@ -18,6 +18,7 @@
 import pytorch_lightning as pl
 import torch
 from hydra.utils import instantiate
+from omegaconf import OmegaConf
 from timm.scheduler import CosineLRScheduler
 from torch.distributed.optim import ZeroRedundancyOptimizer

@@ -212,7 +213,10 @@ def __init__(
             scalers=self.scalers,
             data_indices=self.data_indices,
         )
-        print_variable_scaling(self.loss, data_indices)
+        self._scaling_values_log = print_variable_scaling(
+            self.loss,
+            data_indices,
+        )

         self.metrics = torch.nn.ModuleDict(
             {
@@ -761,3 +765,22 @@ def configure_optimizers(self) -> tuple[list[torch.optim.Optimizer], list[dict]]
         )

         return [optimizer], [{"scheduler": scheduler, "interval": "step"}]
+
+    def setup(self, stage: str) -> None:
+        """Lightning hook that is called after model is initialized but before training starts."""
+        # The conditions should be separate, but are combined due to pre-commit hook
+        if stage == "fit" and self.trainer.is_global_zero and self.logger is not None:
+            # Log hyperparameters on rank 0
+            hyper_params = OmegaConf.to_container(convert_to_omegaconf(self.config), resolve=True)
+            hyper_params.update({"variable_loss_scaling": self._scaling_values_log})
+            # Expand keys for better visibility
+            expand_keys = OmegaConf.select(
+                convert_to_omegaconf(self.config),
+                "diagnostics.log.mlflow.expand_hyperparams",
+                default=["config"],
+            )
+            # Log hyperparameters
+            self.logger.log_hyperparams(
+                hyper_params,
+                expand_keys=expand_keys,
+            )
```
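To make the effect of the new `setup` hook concrete, here is a rough sketch of the payload handed to `log_hyperparams`; the config entries and scaling values are invented, and the exact expansion behaviour depends on the configured logger (e.g. `AnemoiMLflowLogger`).

```python
# Rough sketch of the hyperparameter payload assembled in setup(); all values invented.
# In the real code the config dict comes from
# OmegaConf.to_container(convert_to_omegaconf(self.config), resolve=True).
hyper_params = {
    "training": {"max_epochs": 10},          # hypothetical config entries
    "diagnostics": {"log": {"mlflow": {"expand_hyperparams": ["config"]}}},
}

# The per-variable scaling returned by print_variable_scaling() is attached
# under its own key, so it appears alongside the config in the run's parameters.
hyper_params["variable_loss_scaling"] = {
    "2t": 0.025,          # hypothetical
    "10u": 0.0125,        # hypothetical
    "total_sum": 0.0375,
}

# expand_keys is read from diagnostics.log.mlflow.expand_hyperparams and falls
# back to ["config"]; it names the top-level keys the logger should expand.
expand_keys = ["config"]
# self.logger.log_hyperparams(hyper_params, expand_keys=expand_keys)
```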

0 commit comments
