 from weathergen.utils.train_logger import TrainLogger
 from weathergen.utils.validation_io import write_validation

+_logger = logging.getLogger(__name__)
+

 class Trainer(Trainer_Base):
     ###########################################
@@ -47,7 +49,14 @@ def __init__(self, log_freq=20, checkpoint_freq=250, print_freq=10):
         self.print_freq = print_freq

     ###########################################
-    def init(self, cf, run_id_contd=None, epoch_contd=None, run_id_new=False, run_mode="training"):
+    def init(
+        self,
+        cf,
+        run_id_contd=None,
+        epoch_contd=None,
+        run_id_new=False,
+        run_mode="training",
+    ):
         self.cf = cf

         if isinstance(run_id_new, str):
@@ -284,7 +293,7 @@ def evaluate_jac(self, cf, run_id, epoch, mode="row", date=None, obs_id=0, sampl
         )

     ###########################################
-    def run(self, cf, run_id_contd=None, epoch_contd=None, run_id_new=False):
+    def run(self, cf, private_cf, run_id_contd=None, epoch_contd=None, run_id_new=False):
         # general initialization
         self.init(cf, run_id_contd, epoch_contd, run_id_new)

@@ -419,18 +428,23 @@ def run(self, cf, run_id_contd=None, epoch_contd=None, run_id_new=False):
         )
         self.grad_scaler = torch.amp.GradScaler("cuda")

+        assert len(self.dataset) > 0, f"No data found in {self.dataset}"
+
         # lr is updated after each batch so account for this
+        # TODO: conf should be read-only, do not modify the conf in flight
         cf.lr_steps = int((len(self.dataset) * cf.num_epochs) / cf.batch_size)
+
         steps_decay = cf.lr_steps - cf.lr_steps_warmup - cf.lr_steps_cooldown
+        _logger.debug(f"steps_decay={steps_decay} lr_steps={cf.lr_steps}")
         # ensure that steps_decay has a reasonable value
         if steps_decay < int(0.2 * cf.lr_steps):
             cf.lr_steps_warmup = int(0.1 * cf.lr_steps)
             cf.lr_steps_cooldown = int(0.05 * cf.lr_steps)
             steps_decay = cf.lr_steps - cf.lr_steps_warmup - cf.lr_steps_cooldown
430- str = f"cf.lr_steps_warmup and cf.lr_steps_cooldown were larger than cf.lr_steps={ cf .lr_steps } "
431- str += ". The value have been adjusted to cf.lr_steps_warmup={cf.lr_steps_warmup} and "
432- str += " cf.lr_steps_cooldown={cf.lr_steps_cooldown} so that steps_decay={steps_decay}."
433- logging .getLogger ("obslearn" ).warning ("" )
444+ s = f"cf.lr_steps_warmup and cf.lr_steps_cooldown were larger than cf.lr_steps={ cf .lr_steps } "
445+ s += f ". The value have been adjusted to cf.lr_steps_warmup={ cf .lr_steps_warmup } and "
446+ s += f " cf.lr_steps_cooldown={ cf .lr_steps_cooldown } so that steps_decay={ steps_decay } ."
447+ logging .getLogger ("obslearn" ).warning (s )
         self.lr_scheduler = LearningRateScheduler(
             self.optimizer,
             cf.batch_size,
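
For reference, the step bookkeeping in this hunk reduces to the self-contained sketch below. One optimizer step is taken per batch, so cf.lr_steps = len(dataset) * num_epochs / batch_size; warmup and cooldown are then rescaled if they would squeeze the decay phase. The helper adjust_lr_steps is hypothetical, named only to mirror the config fields above:

def adjust_lr_steps(lr_steps: int, warmup: int, cooldown: int) -> tuple[int, int, int]:
    """Return (warmup, cooldown, steps_decay), rescaling warmup/cooldown to
    10%/5% of lr_steps when they would leave under 20% for the decay phase."""
    steps_decay = lr_steps - warmup - cooldown
    if steps_decay < int(0.2 * lr_steps):
        warmup = int(0.1 * lr_steps)
        cooldown = int(0.05 * lr_steps)
        steps_decay = lr_steps - warmup - cooldown
    return warmup, cooldown, steps_decay

# e.g. adjust_lr_steps(1000, 600, 500) == (100, 50, 850)
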
@@ -558,7 +572,11 @@ def compute_loss(
                 )
                 if tro_type == "token":
                     pred = pred.reshape(
-                        [*pred.shape[:2], target.shape[-2], target.shape[-1] - gs]
+                        [
+                            *pred.shape[:2],
+                            target.shape[-2],
+                            target.shape[-1] - gs,
+                        ]
                     )
                     pred = torch.cat([pred[:, i, :l] for i, l in enumerate(sl)], 1)
                 else:
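
The reshape/cat pair above is behavior-preserving formatting; for clarity, here is a minimal sketch of the shapes involved, assuming gs counts trailing target columns that are not predicted and sl holds each token's number of valid (unpadded) points — both assumptions inferred from the call pattern, not confirmed by the source:

import torch

ens, n_tok, pts, ch, gs = 2, 3, 8, 5, 1    # toy dimensions (assumed)
pred = torch.randn(ens, n_tok, pts * ch)   # flattened per-token predictions
target = torch.randn(n_tok, pts, ch + gs)  # gs trailing columns not predicted
sl = [6, 8, 4]                             # valid points per token (<= pts)

pred = pred.reshape([*pred.shape[:2], target.shape[-2], target.shape[-1] - gs])
pred = torch.cat([pred[:, i, :l] for i, l in enumerate(sl)], 1)
assert pred.shape == (ens, sum(sl), ch)    # padding stripped, tokens joined
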
@@ -600,7 +618,7 @@ def compute_loss(
                         target_data[mask, i],
                         pred[:, mask, i],
                         pred[:, mask, i].mean(0),
-                        pred[:, mask, i].std(0) if ens else torch.zeros(1),
+                        (pred[:, mask, i].std(0) if ens else torch.zeros(1)),
                     )
                     val_uw += temp.item()
                     val = val + channel_loss_weight[i] * temp  # * tw[jj]
@@ -613,9 +631,11 @@ def compute_loss(
                         target_data[mask_nan[:, i], i],
                         pred[:, mask_nan[:, i], i],
                         pred[:, mask_nan[:, i], i].mean(0),
-                        pred[:, mask_nan[:, i], i].std(0)
-                        if ens
-                        else torch.zeros(1),
+                        (
+                            pred[:, mask_nan[:, i], i].std(0)
+                            if ens
+                            else torch.zeros(1)
+                        ),
                     )
                     val_uw += temp.item()
                     val = val + channel_loss_weight[i] * temp
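
Both loss hunks only re-wrap the ensemble-spread argument for readability; the surrounding pattern is a per-channel masked loss over an ensemble. A standalone sketch under assumed shapes (this loss_fn is a hypothetical stand-in whose signature is inferred from the call sites above):

import torch

def loss_fn(target, preds, mean, std):
    # stand-in statistic: MSE of the ensemble mean plus a spread term
    return ((mean - target) ** 2).mean() + std.mean()

pred = torch.randn(4, 100, 3)    # (ensemble, points, channels)
target_data = torch.randn(100, 3)
mask_nan = ~target_data.isnan()  # per-channel validity mask
ens = pred.shape[0] > 1
channel_loss_weight = torch.ones(target_data.shape[-1])

val, val_uw = 0.0, 0.0
for i in range(target_data.shape[-1]):
    m = mask_nan[:, i]
    temp = loss_fn(
        target_data[m, i],
        pred[:, m, i],
        pred[:, m, i].mean(0),
        (pred[:, m, i].std(0) if ens else torch.zeros(1)),
    )
    val_uw += temp.item()
    val = val + channel_loss_weight[i] * temp
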
@@ -1028,7 +1048,10 @@ def log_terminal(self, bidx, epoch):
         )
         print("\t", end="")
         for i_obs, rt in enumerate(self.cf.streams):
-            print("{}".format(rt["name"]) + f" : {l_avg[0, i_obs]:0.4E}\t", end="")
+            print(
+                "{}".format(rt["name"]) + f" : {l_avg[0, i_obs]:0.4E}\t",
+                end="",
+            )
         print("\n", flush=True)

         self.t_start = time.time()