Distinguish between primary runs ('candidates') and secondary runs (#64)

dnerini · web-flow · commit 05d327612d6e · 2025-10-20T14:27:18.000+02:00
* Distinguish between primary runs ('candidates') and secondary runs

* Update docstrings
diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -54,7 +54,7 @@ rule sandbox_all:
     input:
         expand(
             rules.create_inference_sandbox.output.sandbox,
-            run_id=collect_all_runs(),
+            run_id=collect_all_candidates(),
         ),
 
 
@@ -64,7 +64,7 @@ rule run_inference_all:
         expand(
             OUT_ROOT / "data/runs/{run_id}/{init_time}/raw",
             init_time=[t.strftime("%Y%m%d%H%M") for t in REFTIMES],
-            run_id=collect_all_runs(),
+            run_id=collect_all_candidates(),
         ),
 
 
@@ -73,7 +73,7 @@ rule verif_metrics_all:
         expand(
             rules.verif_metrics.output,
             init_time=[t.strftime("%Y%m%d%H%M") for t in REFTIMES],
-            run_id=collect_all_runs(),
+            run_id=collect_all_candidates(),
         ),
 
 
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
@@ -66,12 +66,13 @@ REFTIMES = _reftimes()
 
 
 def collect_all_runs():
-    """Collect all runs defined in the configuration."""
+    """Collect all runs defined in the configuration, including secondary runs."""
     runs = {}
     for run_entry in copy.deepcopy(config["runs"]):
         model_type = next(iter(run_entry))
         run_config = run_entry[model_type]
         run_config["model_type"] = model_type
+        run_config["is_candidate"] = True
         run_id = run_config["mlflow_id"][0:9]
 
         if model_type == "interpolator":
@@ -82,6 +83,7 @@ def collect_all_runs():
                 # Ensure a proper 'forecaster' entry exists with model_type
                 fore_cfg = copy.deepcopy(run_config["forecaster"])
                 fore_cfg["model_type"] = "forecaster"
+                fore_cfg["is_candidate"] = False  # exclude from outputs
                 runs[tail_id] = fore_cfg
             run_id = f"{run_id}-{tail_id}"
 
@@ -90,6 +92,16 @@ def collect_all_runs():
     return runs
 
 
+def collect_all_candidates():
+    """Return the subset of collected runs flagged as candidates."""
+    # Entries lacking the "is_candidate" flag are treated as non-candidates.
+    return {
+        name: cfg
+        for name, cfg in collect_all_runs().items()
+        if cfg.get("is_candidate", False)
+    }
+
+
 def collect_all_baselines():
     """Collect all baselines defined in the configuration."""
     baselines = {}
@@ -106,7 +118,8 @@ def collect_experiment_participants():
     for base in BASELINE_CONFIGS.keys():
         participants[base] = OUT_ROOT / f"data/baselines/{base}/verif_aggregated.nc"
     for exp in RUN_CONFIGS.keys():
-        participants[exp] = OUT_ROOT / f"data/runs/{exp}/verif_aggregated.nc"
+        if RUN_CONFIGS[exp].get("is_candidate", False):
+            participants[exp] = OUT_ROOT / f"data/runs/{exp}/verif_aggregated.nc"
     return participants