
Commit 5a1ef0e

ENH: Doc config in sheet (#869)
1 parent 525646a commit 5a1ef0e

3 files changed: +54 -44 lines changed

docs/source/v1.6.md.inc

Lines changed: 2 additions & 1 deletion

@@ -9,8 +9,9 @@
 - Added saving of clean raw data in addition to epochs (#840 by @larsoner)
 - Added saving of detected blink and cardiac events used to calculate SSP projectors (#840 by @larsoner)
 - Added [`noise_cov_method`][mne_bids_pipeline._config.noise_cov_method] to allow for the use of methods other than `"shrunk"` for noise covariance estimation (#854 by @larsoner)
-- Added option to pass `image_kwargs` to [`mne.Report.add_epochs`] to allow adjusting e.g. `"vmin"` and `"vmax"` of the epochs image in the report via [`report_add_epochs_image_kwargs`][mne_bids_pipeline._config.report_add_epochs_image_kwargs] (#848 by @SophieHerbst)
+- Added option to pass `image_kwargs` to [`mne.Report.add_epochs`] to allow adjusting e.g. `"vmin"` and `"vmax"` of the epochs image in the report via [`report_add_epochs_image_kwargs`][mne_bids_pipeline._config.report_add_epochs_image_kwargs]. This feature requires MNE-Python 1.7 or newer. (#848 by @SophieHerbst)
 - Split ICA fitting and artifact detection into separate steps. This means that now, ICA is split into a total of three consecutive steps: fitting, artifact detection, and the actual data cleaning step ("applying ICA"). This makes it easier to experiment with different settings for artifact detection without needing to re-fit ICA. (#865 by @larsoner)
+- The configuration used for the pipeline is now saved in a separate spreadsheet in the `.xlsx` log file (#869 by @larsoner)

 [//]: # (### :warning: Behavior changes)
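In practice, the new sheet can be read back with pandas. A minimal sketch, not part of this commit; the log-file path below is a made-up example:

```python
import pandas as pd

# The commit writes one "config" sheet per .xlsx log file (see _run.py below).
# This path is hypothetical, not the pipeline's actual output location.
log_path = "derivatives/logs/preprocessing.xlsx"

config_df = pd.read_excel(log_path, sheet_name="config")  # one row, one column per option
print(config_df.iloc[0])  # each cell holds a JSON-encoded config value
```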

mne_bids_pipeline/_config_import.py

Lines changed: 22 additions & 7 deletions
@@ -28,7 +28,18 @@ def _import_config(
     """Import the default config and the user's config."""
     # Get the default
     config = _get_default_config()
+    # Public names users generally will have in their config
     valid_names = [d for d in dir(config) if not d.startswith("_")]
+    # Names that we will reduce the SimpleNamespace to before returning
+    # (see _update_with_user_config)
+    keep_names = [d for d in dir(config) if not d.startswith("__")] + [
+        "config_path",
+        "PIPELINE_NAME",
+        "VERSION",
+        "CODE_URL",
+        "_raw_split_size",
+        "_epochs_split_size",
+    ]

     # Update with user config
     user_names = _update_with_user_config(
@@ -48,17 +59,21 @@
         config_path=extra_config,
     )
     extra_exec_params_keys = ("_n_jobs",)
+    keep_names.extend(extra_exec_params_keys)

     # Check it
     if check:
         _check_config(config, config_path)
         _check_misspellings_removals(
-            config,
             valid_names=valid_names,
             user_names=user_names,
             log=log,
+            config_validation=config.config_validation,
        )

+    # Finally, reduce to our actual supported params (all keep_names should be present)
+    config = SimpleNamespace(**{k: getattr(config, k) for k in keep_names})
+
     # Take some standard actions
     mne.set_log_level(verbose=config.mne_log_level.upper())

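The added `keep_names` reduction rebuilds the namespace from a whitelist. A self-contained sketch of the same pattern, with made-up attribute names rather than the pipeline's real options:

```python
from types import SimpleNamespace

# Made-up config attributes, for illustration only.
config = SimpleNamespace(task="rest", ch_types=["meg"], _scratch_dir="/tmp")
keep_names = ["task", "ch_types"]

# Rebuild the namespace keeping only the whitelisted attributes.
config = SimpleNamespace(**{k: getattr(config, k) for k in keep_names})
print(config)  # namespace(task='rest', ch_types=['meg'])
```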
@@ -406,11 +421,11 @@ def _pydantic_validate(


 def _check_misspellings_removals(
-    config: SimpleNamespace,
     *,
     valid_names: list[str],
     user_names: list[str],
     log: bool,
+    config_validation: str,
 ) -> None:
     # for each name in the user names, check if it's in the valid names but
     # the correct one is not defined
@@ -427,7 +442,7 @@
                 "the variable to reduce ambiguity and avoid this message, "
                 "or set config.config_validation to 'warn' or 'ignore'."
             )
-            _handle_config_error(this_msg, log, config)
+            _handle_config_error(this_msg, log, config_validation)
         if user_name in _REMOVED_NAMES:
             new = _REMOVED_NAMES[user_name]["new_name"]
             if new not in user_names:
@@ -438,16 +453,16 @@
                     f"{msg} this variable has been removed as a valid "
                     f"config option, {instead}."
                 )
-                _handle_config_error(this_msg, log, config)
+                _handle_config_error(this_msg, log, config_validation)


 def _handle_config_error(
     msg: str,
     log: bool,
-    config: SimpleNamespace,
+    config_validation: str,
 ) -> None:
-    if config.config_validation == "raise":
+    if config_validation == "raise":
         raise ValueError(msg)
-    elif config.config_validation == "warn":
+    elif config_validation == "warn":
         if log:
             logger.warning(**gen_log_kwargs(message=msg, emoji="🛟"))
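Passing only the `config_validation` string, instead of the whole config namespace, means the handler can be exercised on its own. An illustrative standalone version, with a `print` standing in for the pipeline's logger:

```python
def handle_config_error(msg: str, log: bool, config_validation: str) -> None:
    # Mirrors the three modes above: "raise", "warn", and (implicitly) "ignore".
    if config_validation == "raise":
        raise ValueError(msg)
    elif config_validation == "warn":
        if log:
            print(f"WARNING: {msg}")  # stand-in for logger.warning(...)
    # any other value (e.g. "ignore") falls through silently


handle_config_error("unknown config option", log=True, config_validation="warn")
```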

mne_bids_pipeline/_run.py

Lines changed: 30 additions & 36 deletions
@@ -4,7 +4,6 @@
 import functools
 import hashlib
 import inspect
-import json
 import pathlib
 import pdb
 import sys
@@ -38,14 +37,10 @@ def __mne_bids_pipeline_failsafe_wrapper__(*args, **kwargs):
             get_input_fnames=get_input_fnames,
             get_output_fnames=get_output_fnames,
         )
-        kwargs_copy = copy.deepcopy(kwargs)
         t0 = time.time()
-        kwargs_copy["cfg"] = json_tricks.dumps(
-            kwargs_copy["cfg"], sort_keys=False, indent=4
-        )
         log_info = pd.concat(
             [
-                pd.Series(kwargs_copy, dtype=object),
+                pd.Series(kwargs, dtype=object),
                 pd.Series(index=["time", "success", "error_message"], dtype=object),
             ]
         )
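For reference, concatenating the step's kwargs with empty bookkeeping fields yields one flat Series per run. A small sketch of that shape, with made-up values:

```python
import pandas as pd

# Illustrative values; the real wrapper passes the step's actual kwargs.
log_info = pd.concat(
    [
        pd.Series({"subject": "01"}, dtype=object),
        pd.Series(index=["time", "success", "error_message"], dtype=object),
    ]
)
print(list(log_info.index))  # ['subject', 'time', 'success', 'error_message']
```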
@@ -58,10 +53,10 @@ def __mne_bids_pipeline_failsafe_wrapper__(*args, **kwargs):
             log_info["error_message"] = ""
         except Exception as e:
             # Only keep what gen_log_kwargs() can handle
-            kwargs_copy = {
-                k: v
-                for k, v in kwargs_copy.items()
-                if k in ("subject", "session", "task", "run")
+            kwargs_log = {
+                k: kwargs[k]
+                for k in ("subject", "session", "task", "run")
+                if k in kwargs
             }
             message = (
                 f"A critical error occurred. " f"The error message was: {str(e)}"
@@ -88,13 +83,13 @@ def __mne_bids_pipeline_failsafe_wrapper__(*args, **kwargs):
             if _is_testing():
                 raise
             logger.error(
-                **gen_log_kwargs(message=message, **kwargs_copy, emoji="❌")
+                **gen_log_kwargs(message=message, **kwargs_log, emoji="❌")
             )
             sys.exit(1)
         elif on_error == "debug":
             message += "\n\nStarting post-mortem debugger."
             logger.error(
-                **gen_log_kwargs(message=message, **kwargs_copy, emoji="🐛")
+                **gen_log_kwargs(message=message, **kwargs_log, emoji="🐛")
             )
             extype, value, tb = sys.exc_info()
             print(tb)
@@ -103,7 +98,7 @@ def __mne_bids_pipeline_failsafe_wrapper__(*args, **kwargs):
         else:
             message += "\n\nContinuing pipeline run."
             logger.error(
-                **gen_log_kwargs(message=message, **kwargs_copy, emoji="🔂")
+                **gen_log_kwargs(message=message, **kwargs_log, emoji="🔂")
             )
         log_info["time"] = round(time.time() - t0, ndigits=1)
         return log_info
@@ -285,29 +280,8 @@ def save_logs(*, config: SimpleNamespace, logs: list[pd.Series]) -> None:
     sheet_name = _short_step_path(_get_step_path()).replace("/", "-")
     sheet_name = sheet_name[-30:]  # shorten due to limit of excel format

-    # We need to make the logs more compact to be able to write Excel format
-    # (32767 char limit per cell), in particular the "cfg" column has very large
-    # cells, so replace the "cfg" column with separated cfg.* columns (still truncated
-    # to the 32767 char limit)
-    compact_logs = list()
-    for log in logs:
-        log = log.copy()
-        # 1. Remove indentation (e.g., 220814 chars to 54416)
-        cfg = json.loads(log["cfg"])
-        del log["cfg"]
-        assert cfg["__instance_type__"] == ["types", "SimpleNamespace"], cfg[
-            "__instance_type__"
-        ]
-        for key, val in cfg["attributes"].items():
-            if isinstance(val, dict) and list(val.keys()) == ["__pathlib__"]:
-                val = val["__pathlib__"]
-            val = json.dumps(val, separators=(",", ":"))
-            if len(val) > 32767:
-                val = val[:32765] + " …"
-            log[f"cfg.{key}"] = val
-        compact_logs.append(log)
-    df = pd.DataFrame(compact_logs)
-    del logs, compact_logs
+    df = pd.DataFrame(logs)
+    del logs

     with FileLock(fname.with_suffix(fname.suffix + ".lock")):
         append = fname.exists()
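With the per-run `cfg` column gone, the list of per-run Series stacks directly into a DataFrame. An illustrative sketch:

```python
import pandas as pd

# Two made-up per-run log Series, one row each in the resulting frame.
logs = [
    pd.Series({"subject": "01", "time": 1.2, "success": True, "error_message": ""}),
    pd.Series({"subject": "02", "time": 1.5, "success": True, "error_message": ""}),
]
df = pd.DataFrame(logs)
print(df.shape)  # (2, 4)
```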
@@ -317,7 +291,27 @@ def save_logs(*, config: SimpleNamespace, logs: list[pd.Series]) -> None:
         mode="a" if append else "w",
         if_sheet_exists="replace" if append else None,
     )
+    assert isinstance(config, SimpleNamespace), type(config)
+    cf_df = dict()
+    for key, val in config.__dict__.items():
+        # We need to be careful about functions, json_tricks does not work with them
+        if inspect.isfunction(val):
+            new_val = ""
+            if func_file := inspect.getfile(val):
+                new_val += f"{func_file}:"
+            if getattr(val, "__qualname__", None):
+                new_val += val.__qualname__
+            val = "custom callable" if not new_val else new_val
+        val = json_tricks.dumps(val, indent=4, sort_keys=False)
+        # 32767 char limit per cell (could split over lines but if something is
+        # this long, you'll probably get the gist from the first 32k chars)
+        if len(val) > 32767:
+            val = val[:32765] + " …"
+        cf_df[key] = val
+    cf_df = pd.DataFrame([cf_df], dtype=object)
     with writer:
+        # Config first then the data
+        cf_df.to_excel(writer, sheet_name="config", index=False)
         df.to_excel(writer, sheet_name=sheet_name, index=False)
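The truncation above exists because the .xlsx format caps each cell at 32,767 characters. A standalone sketch of the serialize-then-truncate step, using stdlib `json` instead of `json_tricks` to stay dependency-free:

```python
import json

EXCEL_CELL_LIMIT = 32767  # hard per-cell limit of the .xlsx format


def to_cell(value) -> str:
    """Serialize one config value for a spreadsheet cell, truncating if needed."""
    text = json.dumps(value, indent=4, sort_keys=False)
    if len(text) > EXCEL_CELL_LIMIT:
        text = text[:EXCEL_CELL_LIMIT - 2] + " …"
    return text


print(len(to_cell(list(range(20000)))))  # 32767, exactly at the limit
```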
