
Commit c4afa5c

[219] Ensures the run_id passed through the command line is propagated (ecmwf#252)
1 parent 3f97922 commit c4afa5c

File tree: 7 files changed, +117 −22 lines


integration_tests/small1.py

Lines changed: 3 additions & 1 deletion
@@ -55,10 +55,12 @@ def test_train(setup, test_run_id):
     )
 
     evaluate_from_args(
-        "-start 2022-10-10 -end 2022-10-11 --samples 10 --same_run_id --epoch 0".split()
+        "-start 2022-10-10 -end 2022-10-11 --samples 10 --epoch 0".split()
         + [
             "--run_id",
             test_run_id,
+            "--eval_run_id",
+            test_run_id,
             "--config",
             f"{weathergen_home}/integration_tests/small1.yaml",
         ]
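
The test exercises the replacement for the removed --same_run_id switch: passing the training id again through --eval_run_id keeps the evaluation results under the same run id. A sketch of the two invocation styles (the run id "abc123" is illustrative; the flags and config path are taken from the diff above):

# Sketch, not part of the commit: the two ways to direct evaluation output.
from weathergen import evaluate_from_args

base = "-start 2022-10-10 -end 2022-10-11 --samples 10 --epoch 0".split()
cfg = ["--config", "integration_tests/small1.yaml"]

# Store evaluation results under the training run id (old --same_run_id behavior):
evaluate_from_args(base + ["--run_id", "abc123", "--eval_run_id", "abc123"] + cfg)

# Omit --eval_run_id and the Trainer falls back to its own run-id logic:
evaluate_from_args(base + ["--run_id", "abc123"] + cfg)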

src/weathergen/__init__.py

Lines changed: 6 additions & 7 deletions
@@ -91,12 +91,11 @@ def evaluate_from_args(argl: list[str]):
         help="Path to private configuration file for paths.",
     )
     parser.add_argument(
-        "-n",
-        "--same_run_id",
+        "--eval_run_id",
+        type=str,
         required=False,
-        dest="run_id_new",
-        action="store_false",
-        help="store evaluation results in the same folder as run_id",
+        dest="eval_run_id",
+        help="(optional) if specified, uses the provided run id to store the evaluation results",
     )
     parser.add_argument(
         "--config",
@@ -133,7 +132,7 @@ def evaluate_from_args(argl: list[str]):
     cf.loader_num_workers = min(cf.loader_num_workers, args.samples)
 
     trainer = Trainer()
-    trainer.evaluate(cf, args.run_id, args.epoch, args.run_id_new)
+    trainer.evaluate(cf, args.run_id, args.epoch, run_id_new=args.eval_run_id)
 
 
 ####################################################################################################
@@ -273,7 +272,7 @@ def train_with_args(argl: list[str], stream_dir: str | None):
     trainer = Trainer(checkpoint_freq=250, print_freq=10)
 
     try:
-        trainer.run(cf)
+        trainer.run(cf, run_id_new=args.run_id)
     except Exception:
         extype, value, tb = sys.exc_info()
         traceback.print_exc()
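
Since --eval_run_id has no default, args.eval_run_id is None when the flag is omitted, so trainer.evaluate() receives run_id_new=None and falls through to the Trainer's own run-id logic. A stdlib-only sketch of that behavior (no weathergen code needed):

# Sketch: semantics of the new flag, using only argparse.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--eval_run_id",
    type=str,
    required=False,
    dest="eval_run_id",
    help="(optional) if specified, uses the provided run id to store the evaluation results",
)

assert parser.parse_args([]).eval_run_id is None                    # omitted -> None
assert parser.parse_args(["--eval_run_id", "x7"]).eval_run_id == "x7"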

src/weathergen/train/trainer.py

Lines changed: 8 additions & 6 deletions
@@ -28,6 +28,7 @@
 from weathergen.train.trainer_base import Trainer_Base
 from weathergen.train.utils import get_run_id
 from weathergen.utils.config import Config
+from weathergen.utils.distributed import is_root
 from weathergen.utils.train_logger import TrainLogger
 from weathergen.utils.validation_io import write_validation
 
@@ -48,12 +49,12 @@ def init(
         cf: Config,
         run_id_contd=None,
         epoch_contd=None,  # unused
-        run_id_new=False,
+        run_id_new: bool | str | None = False,
         run_mode="training",  # unused
     ):
         self.cf = cf
 
-        if isinstance(run_id_new, str):
+        if run_id_new is not None and isinstance(run_id_new, str):
             cf.run_id = run_id_new
         elif run_id_new or cf.run_id is None:
             cf.run_id = get_run_id()
@@ -64,6 +65,7 @@ def init(
         assert cf.samples_per_epoch % cf.batch_size == 0
         assert cf.samples_per_validation % cf.batch_size_validation == 0
 
+        _logger.info(f"Starting run with id: {cf.run_id}")
         self.devices = self.init_torch()
 
         self.init_ddp(cf)
@@ -82,7 +84,6 @@ def init(
         self.path_run = path_run
 
         self.init_perf_monitoring()
-
         self.train_logger = TrainLogger(cf, self.path_run)
 
     ###########################################
@@ -134,7 +135,7 @@ def evaluate(self, cf, run_id_trained, epoch, run_id_new=False):
         _logger.info(f"Finished evaluation run with id: {cf.run_id}")
 
     ###########################################
-    def run(self, cf, run_id_contd=None, epoch_contd=None, run_id_new=False):
+    def run(self, cf, run_id_contd=None, epoch_contd=None, run_id_new: bool | str = False):
         # general initalization
         self.init(cf, run_id_contd, epoch_contd, run_id_new)
 
@@ -169,6 +170,7 @@ def run(self, cf, run_id_contd=None, epoch_contd=None, run_id_new=False):
         self.model = Model(cf, sources_size, targets_num_channels, targets_coords_size).create()
         # load model if specified
         if run_id_contd is not None:
+            _logger.info(f"Continuing run with id={run_id_contd} at epoch {epoch_contd}.")
             self.model.load(run_id_contd, epoch_contd)
             _logger.info(f"Loaded model id={run_id_contd}.")
 
@@ -278,7 +280,7 @@ def run(self, cf, run_id_contd=None, epoch_contd=None, run_id_new=False):
         if cf.forecast_policy is not None:
             torch._dynamo.config.optimize_ddp = False
 
-        if self.cf.rank == 0:
+        if is_root():
             config.save(self.cf, None)
             config.print_cf(self.cf)
 
@@ -674,7 +676,7 @@ def save_model(self, epoch: int, name=None):
         else:
             state = self.ddp_model.state_dict()
 
-        if self.cf.rank == 0:
+        if is_root():
             filename = "".join(
                 [
                     self.cf.run_id,
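
The run-id resolution that Trainer.init() now implements can be restated standalone; a sketch (get_run_id below is a stand-in for weathergen.train.utils.get_run_id, and the id values are illustrative):

# Sketch, not the repo's code: precedence of run_id_new over the config's run_id.
def resolve_run_id(run_id_new, cf_run_id, get_run_id=lambda: "fresh-id"):
    if run_id_new is not None and isinstance(run_id_new, str):
        return run_id_new      # explicit id (e.g. from --eval_run_id or --run_id) wins
    if run_id_new or cf_run_id is None:
        return get_run_id()    # truthy flag or missing id -> generate a new one
    return cf_run_id           # otherwise keep the id already in the config

assert resolve_run_id("abc123", None) == "abc123"
assert resolve_run_id(None, None) == "fresh-id"
assert resolve_run_id(False, "xyz789") == "xyz789"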

src/weathergen/train/trainer_base.py

Lines changed: 42 additions & 5 deletions
@@ -8,16 +8,18 @@
 # nor does it submit to any jurisdiction.
 
 import datetime
+import errno
 import logging
 import os
+import socket
 
 import pynvml
 import torch
 import torch.distributed as dist
 import torch.multiprocessing
-import torch.utils.data.distributed
 
 from weathergen.train.utils import str_to_tensor, tensor_to_str
+from weathergen.utils.distributed import is_root
 
 _logger = logging.getLogger(__name__)
 
@@ -70,27 +72,61 @@ def init_ddp(cf):
         cf.with_ddp = False
         cf.rank = rank
         cf.num_ranks = num_ranks
+        _logger.info(
+            "DDP not initialized. MASTER_ADDR not set. Running in single process mode."
+        )
         return
 
     local_rank = int(os.environ.get("SLURM_LOCALID"))
     ranks_per_node = int(os.environ.get("SLURM_TASKS_PER_NODE", "1")[0])
     rank = int(os.environ.get("SLURM_NODEID")) * ranks_per_node + local_rank
     num_ranks = int(os.environ.get("SLURM_NTASKS"))
+    _logger.info(
+        f"DDP initialization: local_rank={local_rank}, ranks_per_node={ranks_per_node}, rank={rank}, num_ranks={num_ranks}"
+    )
+
+    if rank == 0:
+        # Check that port 1345 is available, raise an error if not
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+            try:
+                s.bind((master_node, 1345))
+            except OSError as e:
+                if e.errno == errno.EADDRINUSE:
+                    _logger.error(
+                        f"Port 1345 is already in use on {master_node}. Please check your network configuration."
+                    )
+                    raise
+                else:
+                    _logger.error(f"Error while binding to port 1345 on {master_node}: {e}")
+                    raise
+
+    _logger.info(
+        f"Initializing DDP with rank {rank} out of {num_ranks} on master_node:{master_node}."
+    )
 
     dist.init_process_group(
         backend="nccl",
         init_method="tcp://" + master_node + ":1345",
-        timeout=datetime.timedelta(seconds=10 * 8192),
+        timeout=datetime.timedelta(seconds=240),
         world_size=num_ranks,
         rank=rank,
+        device_id=torch.device("cuda", local_rank),
     )
+    if is_root():
+        _logger.info("DDP initialized: root.")
+    # Wait for all ranks to reach this point
+    dist.barrier()
 
     # communicate run id to all nodes
-    run_id_int = torch.zeros(8, dtype=torch.int32).cuda()
-    if rank == 0:
+    len_run_id = len(cf.run_id)
+    run_id_int = torch.zeros(len_run_id, dtype=torch.int32).cuda()
+    if is_root():
+        _logger.info(f"Communicating run_id to all nodes: {cf.run_id}")
         run_id_int = str_to_tensor(cf.run_id).cuda()
     dist.all_reduce(run_id_int, op=torch.distributed.ReduceOp.SUM)
-    cf.run_id = tensor_to_str(run_id_int)
+    if not is_root():
+        cf.run_id = tensor_to_str(run_id_int)
+    _logger.info(f"rank: {rank} has run_id: {cf.run_id}")
 
     # communicate data_loader_rng_seed
     if hasattr(cf, "data_loader_rng_seed"):
@@ -101,6 +137,7 @@ def init_ddp(cf):
         dist.all_reduce(l_seed, op=torch.distributed.ReduceOp.SUM)
         cf.data_loader_rng_seed = l_seed.item()
 
+    # TODO: move outside of the config
     cf.rank = rank
     cf.num_ranks = num_ranks
    cf.with_ddp = True
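
For readers unfamiliar with the run-id exchange above: a string cannot be sent through NCCL directly, so it is encoded as an int32 tensor, summed across ranks (non-root ranks contribute zeros), and decoded again. A single-process sketch of the encoding, with str_to_tensor/tensor_to_str reimplemented here as stand-ins for the helpers in weathergen.train.utils (assumed behavior; their source is not shown in this diff):

# Sketch: how an all_reduce(SUM) can broadcast a string from rank 0.
import torch

def str_to_tensor(s: str) -> torch.Tensor:
    return torch.tensor([ord(c) for c in s], dtype=torch.int32)

def tensor_to_str(t: torch.Tensor) -> str:
    return "".join(chr(int(v)) for v in t.tolist())

run_id = "abc123"
root_payload = str_to_tensor(run_id)                          # rank 0's contribution
other_payload = torch.zeros(len(run_id), dtype=torch.int32)   # every other rank's contribution
# all_reduce with ReduceOp.SUM leaves the root's values on every rank:
assert tensor_to_str(root_payload + other_payload) == run_id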

src/weathergen/utils/config.py

Lines changed: 19 additions & 1 deletion
@@ -127,8 +127,26 @@ def _load_private_conf(private_home: Path | None) -> OmegaConf:
         _logger.info(f"Loading private config from WEATHERGEN_PRIVATE_CONF: {private_home}.")
 
     elif env_script_path.is_file():
+        _logger.info(f"Loading private config from platform-env.py: {env_script_path}.")
+        # This code does many checks to ensure that any error message is surfaced. Since it is a process call,
+        # it can be hard to diagnose the error.
+        # TODO: eventually, put all this wrapper code in a separate function
+        try:
+            result_hpc = subprocess.run(
+                [str(env_script_path), "hpc"], capture_output=True, text=True, check=True
+            )
+        except subprocess.CalledProcessError as e:
+            _logger.error(
+                f"Error while running platform-env.py: {e} {e.stderr} {e.stdout} {e.output} {e.returncode}"
+            )
+            raise
+        if result_hpc.returncode != 0:
+            _logger.error(f"Error while running platform-env.py: {result_hpc.stderr.strip()}")
+            raise RuntimeError(f"Error while running platform-env.py: {result_hpc.stderr.strip()}")
+        _logger.info(f"Detected HPC: {result_hpc.stdout.strip()}.")
+
         result = subprocess.run(
-            [str(env_script_path), "hpc-config"], capture_output=True, text=True
+            [str(env_script_path), "hpc-config"], capture_output=True, text=True, check=True
         )
         private_home = Path(result.stdout.strip())
         _logger.info(f"Loading private config from platform-env.py output: {private_home}.")
src/weathergen/utils/distributed.py

Lines changed: 34 additions & 0 deletions

@@ -0,0 +1,34 @@
+"""
+Utilities for writing distributed pytorch-based code.
+
+This module is adapted from code by Seb Hoffmann at:
+https://github.com/sehoffmann/dmlcloud/blob/develop/dmlcloud/core/distributed.py
+
+(same license as the rest of the code)
+Copyright (c) 2025, Sebastian Hoffmann
+"""
+
+# TODO: copy other utilities from dmlcloud such as root_wrap etc.
+# TODO: move the DDP code from trainer.py to this file
+
+import torch.distributed as dist
+
+SYNC_TIMEOUT_SEC = 60 * 60  # 1 hour
+
+
+def is_root(pg: dist.ProcessGroup | None = None) -> bool:
+    """
+    Check if the current rank is the root rank (rank 0).
+
+    Args:
+        pg (ProcessGroup, optional): The process group to work on. If None (default), the default process group will be used.
+    """
+    if not _is_distributed_initialized():
+        # If not initialized, it is assumed to be in single process mode.
+        # TODO: check what should happen if a process group is passed
+        return True
+    return dist.get_rank(pg) == 0
+
+
+def _is_distributed_initialized():
+    return dist.is_available() and dist.is_initialized()
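
A short usage sketch of the new helper; because is_root() returns True when torch.distributed was never initialized, call sites need no cf.rank plumbing:

# Sketch: guard rank-0-only work (checkpointing, config printing) uniformly.
from weathergen.utils.distributed import is_root

if is_root():
    print("runs on rank 0 under DDP, and always in single-process mode")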

src/weathergen/utils/logger.py

Lines changed: 5 additions & 2 deletions
@@ -31,9 +31,12 @@ def init_loggers():
 
     WARNING: this function resets all the logging handlers.
 
-    This function can be called only once, so that it can be called repeatedly in multiprocessing pipelines.
+    This function follows a singleton pattern: it will only operate once per process
+    and will be a no-op if called again.
     """
-    formatter = RelPathFormatter("%(pathname)s:%(lineno)d : %(levelname)-8s : %(message)s")
+    formatter = RelPathFormatter(
+        "%(asctime)s %(pathname)s:%(lineno)d : %(levelname)-8s : %(message)s"
+    )
     for package in ["obslearn", "weathergen"]:
         logger = logging.getLogger(package)
         logger.handlers.clear()
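
The effect of prepending %(asctime)s can be previewed with a plain logging.Formatter, used here as a stand-in for the repo's RelPathFormatter (which, judging by its name, rewrites %(pathname)s to a relative path):

# Sketch: preview of the new log format with stdlib logging only.
import logging

handler = logging.StreamHandler()
handler.setFormatter(
    logging.Formatter("%(asctime)s %(pathname)s:%(lineno)d : %(levelname)-8s : %(message)s")
)
log = logging.getLogger("demo")
log.addHandler(handler)
log.warning("hello")
# e.g.: 2025-06-05 10:32:01,123 /path/to/module.py:42 : WARNING  : hello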
