
Commit 6322968

Jubeku (Julian Kuehnert) and co-author authored
Continue training through slurm script (ecmwf#395)

* train_continue via slurm
* using __main__ as entry point for slurm script
* reverting config files to match base branch
* removing param_sum control logging before and after loading of model weights
* run ruff
* check whether from_run_id is in arguments
* trigger PR check
* remove block to set reuse_run_id=True

---------

Co-authored-by: Julian Kuehnert <julian.kuehnert@ecwmf.int>
1 parent 54244de commit 6322968

File tree

3 files changed (+19 additions, -6 deletions)

config/default_config.yml

Lines changed: 0 additions & 1 deletion

@@ -131,4 +131,3 @@ run_id: ???
 train_log:
   # The period to log metrics (in number of batch steps)
   log_interval: 20
-

src/weathergen/run_train.py

Lines changed: 6 additions & 2 deletions

@@ -173,5 +173,9 @@ def train_with_args(argl: list[str], stream_dir: str | None):


 if __name__ == "__main__":
-    train()
-    # train_continue()
+    # Entry point for slurm script.
+    # Check whether --from_run_id passed as argument.
+    if "--from_run_id" in sys.argv:
+        train_continue()
+    else:
+        train()
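
For readers skimming the diff: the new __main__ block is a plain membership test on sys.argv, so any invocation that mentions --from_run_id anywhere on the command line is routed to train_continue(), and everything else falls through to train(). A minimal self-contained sketch of that dispatch pattern (the function bodies and invocation paths below are illustrative placeholders, not the repository's implementations):

    # Sketch of the argv-based dispatch added in run_train.py.
    # The train/train_continue bodies here are placeholders.
    import sys

    def train():
        print("starting a fresh training run")

    def train_continue():
        print("continuing from an existing run id")

    # Hypothetical invocations (paths and run ids are illustrative):
    #   python src/weathergen/run_train.py                       -> train()
    #   python src/weathergen/run_train.py --from_run_id abc123  -> train_continue()
    if __name__ == "__main__":
        if "--from_run_id" in sys.argv:
            train_continue()
        else:
            train()

One consequence of the membership test, as opposed to a full argument-parser definition at the entry point: --from_run_id with a missing or malformed value still selects the continue path, and validation of the run id is left to the downstream argument handling.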

src/weathergen/train/trainer.py

Lines changed: 13 additions & 3 deletions

@@ -16,8 +16,13 @@
 import tqdm
 from torch.distributed.fsdp import FullStateDictConfig, StateDictType
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
-from torch.distributed.fsdp.fully_sharded_data_parallel import MixedPrecision, ShardingStrategy
-from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy  # default_auto_wrap_policy,
+from torch.distributed.fsdp.fully_sharded_data_parallel import (
+    MixedPrecision,
+    ShardingStrategy,
+)
+from torch.distributed.fsdp.wrap import (
+    size_based_auto_wrap_policy,  # default_auto_wrap_policy,
+)

 import weathergen.train.loss as losses
 import weathergen.utils.config as config

@@ -377,7 +382,12 @@ def compute_loss(
         # assert len(targets_rt) == len(preds) and len(preds) == len(self.cf.streams)
         for fstep in range(len(targets_rt)):
             for i_obs, (target, target_coords, si) in enumerate(
-                zip(targets_rt[fstep], targets_coords_rt[fstep], self.cf.streams, strict=False)
+                zip(
+                    targets_rt[fstep],
+                    targets_coords_rt[fstep],
+                    self.cf.streams,
+                    strict=False,
+                )
             ):
                 pred = preds[fstep][i_obs]
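
Both trainer.py hunks are line-length reflows (the commit message notes a ruff pass); behaviour is unchanged. One detail the reflow makes more visible is the strict=False in the zip call: since Python 3.10, zip accepts a strict keyword, and strict=False (the default) silently truncates to the shortest input, while the commented-out assert above the loop suggests the lengths are expected to match anyway. A standalone illustration with made-up values:

    # zip's strict flag (Python 3.10+); the data here is invented,
    # not the repository's.
    targets = ["t0", "t1"]
    streams = ["obs_a", "obs_b", "obs_c"]

    # strict=False (the default): silently truncates to the shortest input.
    print(list(zip(targets, streams, strict=False)))
    # -> [('t0', 'obs_a'), ('t1', 'obs_b')]

    # strict=True: raises ValueError on unequal lengths.
    try:
        list(zip(targets, streams, strict=True))
    except ValueError as err:
        print(err)  # e.g. "zip() argument 2 is longer than argument 1"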

Comments (0)