Hard coded model and results paths ->private_config - Ktezcan/develop/iss49 paths model run (ecmwf#83)

kctezcan · Kerem Tezcan · web-flow · commit 8022b35eddbf · 2025-03-25T13:52:57.000+01:00
* changed model and run paths from hardcoded to private config

* dummy commit for email

* corrected some mistakes

* fixing/testing loading config

* fixed and tested config loader

* reverted date reqs

* cleanup

* fixed more paths in the trainer.py file

* small fix and more cleanups

* removed unused import

* converted to pathlib in model.py and trainer.py

* converted paths to pathlib in config.py and train_logger.py

* fixed: code expecting string path, got Path object

* reverted changes allowing the code to run

* removed the #KCT:path comment

* implement Tim's clarification

* still removing KCT:path comments

* corrected the typo again

* removed the KCT:path comment yet again

* changed from print to logger.info()

* corrected mistake in config.py

---------

Co-authored-by: Kerem Tezcan <ktezcan@balfrin-ln004.cscs.ch>
diff --git a/src/weathergen/__init__.py b/src/weathergen/__init__.py
@@ -88,14 +88,27 @@ def evaluate():
         default=["ERA5"],
         help="Analysis output streams during evaluation.",
     )
+    parser.add_argument(
+        "--private_config",
+        type=str,
+        default=None,
+        help="Path to private configuration file for paths.",
+    )
 
     args = parser.parse_args()
 
+    # get the paths from the private config
+    private_cf = load_private_conf(args.private_config)
+
     # TODO: move somewhere else
     init_loggers()
 
-    # load config if specified
-    cf = Config.load(args.run_id, args.epoch)
+    # load config: if run_id is full path, it loads from there
+    cf = Config.load(args.run_id, args.epoch, private_cf["model_path"])
+
+    # add parameters from private (paths) config
+    for k, v in private_cf.items():
+        setattr(cf, k, v)
 
     cf.run_history += [(cf.run_id, cf.istep)]
 
diff --git a/src/weathergen/model/model.py b/src/weathergen/model/model.py
@@ -537,7 +537,7 @@ def print_num_parameters(self):
 
     #########################################
     def load(self, run_id, epoch=None):
-        path_run = Path("./models/") / run_id
+        path_run = Path(self.cf.model_path) / run_id
         epoch_id = f"epoch{epoch:05d}" if epoch is not None else "latest"
         filename = f"{run_id}_{epoch_id}.chkpt"
 
diff --git a/src/weathergen/train/trainer.py b/src/weathergen/train/trainer.py
@@ -78,8 +78,8 @@ def init(
         cf = self.init_streams(cf, run_id_contd)
 
         # create output directory
-        path_run = Path("./results") / cf.run_id
-        path_model = Path("./models") / cf.run_id
+        path_run = Path(cf.run_path) / cf.run_id
+        path_model = Path(cf.model_path) / cf.run_id
         if self.cf.rank == 0:
             path_run.mkdir(exist_ok=True)
             path_model.mkdir(exist_ok=True)
@@ -790,12 +790,6 @@ def batch_to_device(self, batch):
 
     ###########################################
     def save_model(self, epoch=-1, name=None):
-        path_model = Path("./models/") / self.cf.run_id
-        epoch_str = "latest" if epoch == -1 else f"epoch{epoch:05d}"
-        name_str = f"_{name}" if name is not None else ""
-        file_out = path_model / f"{self.cf.run_id}_{epoch_str}_{name_str}.chkpt"
-        temp_file_out = path_model / f"{self.cf.run_id}_{epoch_str}_{name_str}_temp.chkpt"
-
         if self.cf.with_ddp and self.cf.with_fsdp:
             _cfg = FullStateDictConfig(offload_to_cpu=True, rank0_only=True)
             with FSDP.state_dict_type(
@@ -808,10 +802,22 @@ def save_model(self, epoch=-1, name=None):
             state = self.ddp_model.state_dict()
 
         if self.cf.rank == 0:
+            filename = "".join(
+                [
+                    self.cf.run_id,
+                    "_",
+                    "latest" if epoch == -1 else f"epoch{epoch:05d}",
+                    ("_" + name) if name is not None else "",
+                ]
+            )
+            base_path = Path(self.cf.model_path) / self.cf.run_id
+            file_out: Path = base_path / (filename + ".chkpt")
+            file_tmp: Path = base_path / (filename + "_tmp.chkpt")
             # save temp file (slow)
-            torch.save(state, temp_file_out)
+            torch.save(state, file_tmp)
             # move file (which is changing the link in the file system and very fast)
-            temp_file_out.replace(file_out)
+            file_tmp.replace(file_out)
+
             # save config
             self.cf.save(epoch)
 
diff --git a/src/weathergen/utils/config.py b/src/weathergen/utils/config.py
@@ -13,6 +13,8 @@
 
 import yaml
 
+from weathergen.utils.logger import logger
+
 
 ###########################################
 class Config:
@@ -30,7 +32,7 @@ def print(self):
                         print("{}{} : {}".format("" if k == "reportypes" else "  ", k, v))
 
     def save(self, epoch=None):
-        path_models = Path("./models")
+        path_models = Path(self.model_path)
         # save in directory with model files
         dirname = path_models / self.run_id
         dirname.mkdir(exist_ok=True, parents=True)
@@ -41,21 +43,29 @@ def save(self, epoch=None):
         fname = dirname / f"model_{self.run_id}{epoch_str}.json"
 
         json_str = json.dumps(self.__dict__)
-        with open(fname, "w") as f:
+        with fname.open("w") as f:
             f.write(json_str)
 
     @staticmethod
-    def load(run_id, epoch=None):
-        if "/" in run_id:  # assumed to be full path instead of just id
+    def load(run_id: str, epoch: int = None, model_path: str = "./models") -> "Config":
+        """
+        Load a configuration file from a given run_id and epoch.
+        If run_id is a full path, loads it from the full path.
+        """
+        if Path(run_id).exists():  # load from the full path if a full path is provided
             fname = Path(run_id)
+            logger.info(f"Loading config from provided full run_id path: {fname}")
         else:
-            path_models = Path("./models")
+            path_models = Path(model_path)
             epoch_str = ""
             if epoch is not None:
                 epoch_str = "_latest" if epoch == -1 else f"_epoch{epoch:05d}"
             fname = path_models / run_id / f"model_{run_id}{epoch_str}.json"
 
-        with open(fname) as f:
+            logger.info(f"Loading config from specified run_id and epoch: {fname}")
+
+        # open the file and read into a config object
+        with fname.open() as f:
             json_str = f.readlines()
 
         cf = Config()
diff --git a/src/weathergen/utils/train_logger.py b/src/weathergen/utils/train_logger.py
@@ -58,6 +58,7 @@ def log_metrics(self, stage: Stage, metrics: dict[str, float]) -> None:
 
         # TODO: performance: we repeatedly open the file for each call. Better for multiprocessing
         # but we can probably do better and rely for example on the logging module.
+
         with open(self.path_run / "metrics.json", "ab") as f:
             s = json.dumps(clean_metrics) + "\n"
             f.write(s.encode("utf-8"))
@@ -139,7 +140,7 @@ def read(run_id, epoch=-1):
         cf = Config.load(run_id, epoch)
         run_id = cf.run_id
 
-        result_dir = Path(f"./results/{run_id}")
+        result_dir = Path(cf.run_path) / run_id
         fname_log_train = result_dir / f"{run_id}_train_log.txt"
         fname_log_val = result_dir / f"{run_id}_val_log.txt"
         fname_perf_val = result_dir / f"{run_id}_perf_log.txt"