Misc fixes (ecmwf#171)

clessig · web-flow · commit c42c3e78fc37 · 2025-04-09T16:40:29.000+02:00
* Fixed bug in handling of forecast step for rollout.

* Fixed problems where incomplete configs were saved.

* Fixed problems in handling of forecast policy.

* Applied Ruff to the changed files.
diff --git a/src/weathergen/datasets/multi_stream_data_sampler.py b/src/weathergen/datasets/multi_stream_data_sampler.py
@@ -39,17 +39,16 @@ def __init__(self, cf, start_date, end_date, batch_size, samples_per_epoch, shuf
         self.len_hrs = cf.len_hrs
         self.step_hrs = cf.step_hrs
 
-        fc_policy_seq = (
-            cf.forecast_policy == "sequential" or cf.forecast_policy == "sequential_random"
-        )
-        assert cf.forecast_steps >= 0 if not fc_policy_seq else True
         self.forecast_delta_hrs = (
             cf.forecast_delta_hrs if cf.forecast_delta_hrs > 0 else self.len_hrs
         )
         assert self.forecast_delta_hrs == self.len_hrs, "Only supported option at the moment"
         self.forecast_steps = np.array(
             [cf.forecast_steps] if type(cf.forecast_steps) == int else cf.forecast_steps
         )
+        if cf.forecast_policy is not None:
+            if self.forecast_steps.max() == 0:
+                logger.warning("forecast policy is not None but number of forecast steps is 0.")
         self.forecast_policy = cf.forecast_policy
 
         # end date needs to be adjusted to account for window length
@@ -194,7 +193,11 @@ def get_targets_coords_size(self):
 
     ###################################################
     def reset(self):
-        fsm = self.forecast_steps[min(self.epoch, len(self.forecast_steps) - 1)]
+        fsm = (
+            self.forecast_steps[min(self.epoch, len(self.forecast_steps) - 1)]
+            if self.forecast_policy != "random"
+            else self.forecast_steps.max()
+        )
         if fsm > 0:
             logger.info(f"forecast_steps at epoch={self.epoch} : {fsm}")
 
@@ -319,7 +322,7 @@ def __iter__(self):
 
                     # collect for all forecast steps
                     for fstep in range(forecast_dt + 1):
-                        # collect all sources
+                        # collect all targets
                         for _, ds in enumerate(stream_ds):
                             step_forecast_dt = (
                                 idx + (self.forecast_delta_hrs * fstep) // self.step_hrs
diff --git a/src/weathergen/model/model.py b/src/weathergen/model/model.py
@@ -581,12 +581,12 @@ def forward(self, model_params, batch, forecast_steps):
 
         # roll-out in latent space
         preds_all = []
-        for _ in range(forecast_steps):
+        for fstep in range(forecast_steps):
             # prediction
             preds_all += [
                 self.predict(
                     model_params,
-                    forecast_steps,
+                    fstep,
                     tokens,
                     streams_data,
                     target_coords_idxs,
@@ -610,7 +610,6 @@ def forward(self, model_params, batch, forecast_steps):
 
     #########################################
     def embed_cells(self, model_params, streams_data):
-        # code.interact( local=locals())
         source_tokens_lens = torch.stack(
             [
                 torch.stack(
diff --git a/src/weathergen/train/trainer.py b/src/weathergen/train/trainer.py
@@ -85,10 +85,6 @@ def init(
         if self.cf.rank == 0:
             path_run.mkdir(exist_ok=True)
             path_model.mkdir(exist_ok=True)
-            # save config
-            cf.save()
-            if run_mode == "training":
-                cf.print()
         self.path_run = path_run
 
         self.init_perf_monitoring()
@@ -136,6 +132,9 @@ def evaluate(self, cf, run_id_trained, epoch, run_id_new=False):
         for name, w in cf.loss_fcts_val:
             self.loss_fcts_val += [[getattr(losses, name), w]]
 
+        if self.cf.rank == 0:
+            self.cf.save()
+
         # evaluate validation set
         self.validate(epoch=0)
         print(f"Finished evaluation run with id: {cf.run_id}")
@@ -425,6 +424,7 @@ def run(self, cf, private_cf, run_id_contd=None, epoch_contd=None, run_id_new=Fa
             torch._dynamo.config.optimize_ddp = False
 
         if self.cf.rank == 0:
+            self.cf.save()
             self.cf.print()
 
         # training loop