[103] Initial profiling tools (ecmwf#104)

tjhunter · web-flow · commit 1ecf1610254f · 2025-03-24T17:53:33.000+01:00
* profiling

* adding viztracer

* work

* annotations

* better

* comments

* working

* cleanup

* changes

* comments

* changes

* changes

* changes

* fix
diff --git a/config/profiling/annotations.json b/config/profiling/annotations.json
diff --git a/pyproject.toml b/pyproject.toml
@@ -12,9 +12,9 @@ requires-python = ">=3.11,<3.13"
 dependencies = [
  'torch==2.6.0',
  'numpy~=2.2',
- 'astropy_healpix~=1.0',
+ 'astropy_healpix~=1.1.2',
  'zarr~=2.17',
- 'anemoi-datasets~=0.5',
+ 'anemoi-datasets~=0.5.16',
  'pandas~=2.2',
  'pynvml',
  'tqdm',
diff --git a/src/weathergen/datasets/anemoi_dataset.py b/src/weathergen/datasets/anemoi_dataset.py
@@ -8,10 +8,13 @@
 # nor does it submit to any jurisdiction.
 
 import datetime
+import logging
 
 import numpy as np
 from anemoi.datasets import open_dataset
 
+_logger = logging.getLogger(__name__)
+
 
 class AnemoiDataset:
     "Wrapper for Anemoi dataset"
@@ -30,26 +33,26 @@ def __init__(
         assert len_hrs == step_hrs, "Currently only step_hrs=len_hrs is supported"
 
         # open  dataset to peak that it is compatible with requested parameters
-        self.ds = open_dataset(filename)
+        ds = open_dataset(filename)
 
         # check that start and end time are within the dataset time range
 
-        ds_dt_start = self.ds.dates[0]
-        ds_dt_end = self.ds.dates[-1]
+        ds_dt_start = ds.dates[0]
+        ds_dt_end = ds.dates[-1]
 
         format_str = "%Y%m%d%H%M%S"
         dt_start = datetime.datetime.strptime(str(start), format_str)
         dt_end = datetime.datetime.strptime(str(end), format_str)
 
         # TODO, TODO, TODO: we need proper alignment for the case where self.ds.frequency
         # is not a multile of len_hrs
-        self.num_steps_per_window = int((len_hrs * 3600) / self.ds.frequency.seconds)
+        self.num_steps_per_window = int((len_hrs * 3600) / ds.frequency.seconds)
 
         # open dataset
 
         # caches lats and lons
-        self.latitudes = self.ds.latitudes.astype(np.float32)
-        self.longitudes = self.ds.longitudes.astype(np.float32)
+        self.latitudes = ds.latitudes.astype(np.float32)
+        self.longitudes = ds.longitudes.astype(np.float32)
 
         # TODO: define in base class
         self.geoinfo_idx = []
@@ -59,8 +62,8 @@ def __init__(
         source_channels = stream_info["source"] if "source" in stream_info else None
         self.source_idx = np.sort(
             [
-                self.ds.name_to_index[k]
-                for i, (k, v) in enumerate(self.ds.typed_variables.items())
+                ds.name_to_index[k]
+                for i, (k, v) in enumerate(ds.typed_variables.items())
                 if (
                     not v.is_computed_forcing
                     and not v.is_constant_in_time
@@ -75,8 +78,8 @@ def __init__(
         target_channels = stream_info["target"] if "target" in stream_info else None
         self.target_idx = np.sort(
             [
-                self.ds.name_to_index[k]
-                for i, (k, v) in enumerate(self.ds.typed_variables.items())
+                ds.name_to_index[k]
+                for (k, v) in ds.typed_variables.items()
                 if (
                     not v.is_computed_forcing
                     and not v.is_constant_in_time
@@ -88,21 +91,20 @@ def __init__(
                 )
             ]
         )
-        self.source_channels = [self.ds.variables[i] for i in self.source_idx]
-        self.target_channels = [self.ds.variables[i] for i in self.target_idx]
+        self.source_channels = [ds.variables[i] for i in self.source_idx]
+        self.target_channels = [ds.variables[i] for i in self.target_idx]
 
         self.properties = {
             "stream_id": 0,
         }
-        self.mean = self.ds.statistics["mean"]
-        self.stdev = self.ds.statistics["stdev"]
+        self.mean = ds.statistics["mean"]
+        self.stdev = ds.statistics["stdev"]
 
         # set dataset to None when no overlap with time range
         if dt_start >= ds_dt_end or dt_end <= ds_dt_start:
             self.ds = None
-            return
-
-        self.ds = open_dataset(self.ds, frequency=str(step_hrs) + "h", start=dt_start, end=dt_end)
+        else:
+            self.ds = open_dataset(ds, frequency=str(step_hrs) + "h", start=dt_start, end=dt_end)
 
     def __len__(self):
         "Length of dataset"
@@ -140,8 +142,10 @@ def _get(
             )
 
         # extract number of time steps and collapse ensemble dimension
+
         data = self.ds[idx : idx + self.num_steps_per_window][:, :, 0]
-        # extract channels
+
+        # extract channels
         data = (
             data[:, channels_idx].transpose([0, 2, 1]).reshape((data.shape[0] * data.shape[2], -1))
         )
diff --git a/src/weathergen/datasets/batchifyer.py b/src/weathergen/datasets/batchifyer.py
@@ -23,6 +23,7 @@
     r3tos2,
     s2tor3,
 )
+from weathergen.utils.logger import init_loggers
 
 
 def encode_times_source(times, time_win) -> torch.tensor:
@@ -300,6 +301,7 @@ def batchify_source(
         time_win,
         normalizer,
     ):
+        init_loggers()
         si = stream_info
         token_size = si["token_size"]
         is_diagnostic = si["diagnostic"] if "diagnostic" in stream_info else False
diff --git a/src/weathergen/datasets/multi_stream_data_sampler.py b/src/weathergen/datasets/multi_stream_data_sampler.py
@@ -24,7 +24,7 @@
     compute_offsets_scatter_embed,
     compute_source_cell_lens,
 )
-from weathergen.utils.logger import logger
+from weathergen.utils.logger import init_loggers, logger
 
 
 class MultiStreamDataSampler(torch.utils.data.IterableDataset):
@@ -245,7 +245,7 @@ def __iter__(self):
             len : number of batch items
             len[*] : number of streams
         """
-
+        init_loggers()
         iter_start, iter_end = self.worker_workset()
 
         # create new shuffeling
diff --git a/src/weathergen/run_train.py b/src/weathergen/run_train.py
@@ -0,0 +1,9 @@
+"""
+The entry point to train the weathergen model.
+"""
+
+# For profiling tools, the entry point cannot be in an __init__.py file.
+from weathergen import train
+
+if __name__ == "__main__":
+    train()
diff --git a/src/weathergen/train/trainer_base.py b/src/weathergen/train/trainer_base.py
@@ -16,6 +16,7 @@
 import pynvml
 import torch
 import torch.distributed as dist
+import torch.multiprocessing
 import torch.utils.data.distributed
 import yaml
 
@@ -31,9 +32,19 @@ def __init__(self):
 
     ###########################################
     @staticmethod
-    def init_torch(use_cuda=True, num_accs_per_task=1):
+    def init_torch(use_cuda=True, num_accs_per_task=1, multiprocessing_method="fork"):
+        """
+        Initialize torch, set device and multiprocessing method.
+
+        NOTE: If using the Nvidia profiler, the multiprocessing method must be set to "spawn".
+        The default for linux systems is "fork", which prevents traces from being generated with DDP.
+        """
         torch.set_printoptions(linewidth=120)
 
+        # "spawn" is required by the Nvidia profiler to properly trace events in worker processes.
+        # "spawn" may cause issues with logging. Alternative (the default): "fork"
+        torch.multiprocessing.set_start_method(multiprocessing_method, force=True)
+
         torch.backends.cuda.matmul.allow_tf32 = True
 
         use_cuda = torch.cuda.is_available()
diff --git a/src/weathergen/utils/logger.py b/src/weathergen/utils/logger.py
@@ -10,6 +10,7 @@
 import logging
 import os
 import pathlib
+from functools import cache
 
 
 class RelPathFormatter(logging.Formatter):
@@ -23,11 +24,14 @@ def format(self, record):
         return super().format(record)
 
 
+@cache
 def init_loggers():
     """
     Initialize the logger for the package.
 
     WARNING: this function resets all the logging handlers.
+
+    The body runs only on the first call (the function is cached), so it can be called repeatedly in multiprocessing pipelines.
     """
     formatter = RelPathFormatter("%(pathname)s:%(lineno)d : %(levelname)-8s : %(message)s")
     for package in ["obslearn", "weathergen"]:
diff --git a/uv.lock b/uv.lock