Merge pull request #338 from Climate-REF/151-specified-solve

lewisjared · web-flow · commit 5a379954fc68 · 2025-05-27T14:17:00.000+10:00
diff --git a/changelog/338.feature.md b/changelog/338.feature.md
@@ -0,0 +1,4 @@
+Adds `--diagnostic` and `--provider` arguments to the `ref solve` command.
+This allows users to select a subset of the diagnostics or providers that they wish to run.
+Multiple `--diagnostic` or `--provider` arguments can be used to specify multiple diagnostics or providers.
+A diagnostic or provider is included in the calculations if its slug contains one of the filter values (case-insensitive).
diff --git a/packages/climate-ref/src/climate_ref/cli/solve.py b/packages/climate-ref/src/climate_ref/cli/solve.py
@@ -1,21 +1,48 @@
+from typing import Annotated
+
 import typer
 
-from climate_ref.solver import solve_required_executions
+from climate_ref.solver import SolveFilterOptions, solve_required_executions
 
 app = typer.Typer()
 
 
 @app.command()
-def solve(
+def solve(  # noqa: PLR0913
     ctx: typer.Context,
-    dry_run: bool = typer.Option(False, help="Do not execute any diagnostics"),
+    dry_run: Annotated[
+        bool,
+        typer.Option(help="Do not execute any diagnostics"),
+    ] = False,
+    execute: Annotated[
+        bool,
+        typer.Option(help="Solve the newly identified executions"),
+    ] = True,
     timeout: int = typer.Option(60, help="Timeout in seconds for the solve operation"),
     one_per_provider: bool = typer.Option(
         False, help="Limit to one execution per provider. This is useful for testing"
     ),
     one_per_diagnostic: bool = typer.Option(
         False, help="Limit to one execution per diagnostic. This is useful for testing"
     ),
+    diagnostic: Annotated[
+        list[str] | None,
+        typer.Option(
+            help="Filters executions by the diagnostic slug. "
+            "Diagnostics will be included if any of the filters match a case-insensitive subset "
+            "of the diagnostic slug. "
+            "Multiple values can be provided"
+        ),
+    ] = None,
+    provider: Annotated[
+        list[str] | None,
+        typer.Option(
+            help="Filters executions by provider slug. "
+            "Providers will be included if any of the filters match a case-insensitive subset "
+            "of the provider slug. "
+            "Multiple values can be provided"
+        ),
+    ] = None,
 ) -> None:
     """
     Solve for executions that require recalculation
@@ -25,11 +52,19 @@ def solve(
     """
     config = ctx.obj.config
     db = ctx.obj.database
+
+    filters = SolveFilterOptions(
+        diagnostic=diagnostic,
+        provider=provider,
+    )
+
     solve_required_executions(
         config=config,
         db=db,
         dry_run=dry_run,
+        execute=execute,
         timeout=timeout,
         one_per_provider=one_per_provider,
         one_per_diagnostic=one_per_diagnostic,
+        filters=filters,
     )
diff --git a/packages/climate-ref/src/climate_ref/solver.py b/packages/climate-ref/src/climate_ref/solver.py
@@ -245,6 +245,57 @@ def _solve_from_data_requirements(
         )
 
 
+@define
+class SolveFilterOptions:
+    """
+    Optional provider and diagnostic slug filters that restrict which diagnostics are solved
+    """
+
+    diagnostic: list[str] | None = None
+    """
+    Check if the diagnostic slug contains any of the provided values
+    """
+    provider: list[str] | None = None
+    """
+    Check if the provider slug contains any of the provided values
+    """
+
+
+def matches_filter(diagnostic: Diagnostic, filters: SolveFilterOptions | None) -> bool:
+    """
+    Check if a diagnostic matches the provided filters
+
+    Each filter is optional and a diagnostic will match if it satisfies all the provided filters.
+    i.e. the filters are ANDed together.
+
+    Parameters
+    ----------
+    diagnostic
+        Diagnostic to check against the filters
+    filters
+        Collection of filters to apply to the diagnostic
+
+        If no filters are provided, the diagnostic is considered to match
+
+    Returns
+    -------
+        True if the diagnostic matches the filters, False otherwise
+    """
+    if filters is None:
+        return True
+
+    diagnostic_slug = diagnostic.slug
+    provider_slug = diagnostic.provider.slug
+
+    if filters.provider and not any([f.lower() in provider_slug for f in filters.provider]):
+        return False
+
+    if filters.diagnostic and not any([f.lower() in diagnostic_slug for f in filters.diagnostic]):
+        return False
+
+    return True
+
+
 @define
 class ExecutionSolver:
     """
@@ -278,7 +329,9 @@ def build_from_db(config: Config, db: Database) -> "ExecutionSolver":
             },
         )
 
-    def solve(self) -> typing.Generator[DiagnosticExecution, None, None]:
+    def solve(
+        self, filters: SolveFilterOptions | None = None
+    ) -> typing.Generator[DiagnosticExecution, None, None]:
         """
         Solve which executions need to be calculated for a dataset
 
@@ -293,17 +346,23 @@ def solve(self) -> typing.Generator[DiagnosticExecution, None, None]:
         """
         for provider in self.provider_registry.providers:
             for diagnostic in provider.diagnostics():
+                # Filter the diagnostic based on the provided filters
+                if not matches_filter(diagnostic, filters):
+                    logger.debug(f"Skipping {diagnostic.full_slug()} due to filter")
+                    continue
                 yield from solve_executions(self.data_catalog, diagnostic, provider)
 
 
 def solve_required_executions(  # noqa: PLR0913
     db: Database,
     dry_run: bool = False,
+    execute: bool = True,
     solver: ExecutionSolver | None = None,
     config: Config | None = None,
     timeout: int = 60,
     one_per_provider: bool = False,
     one_per_diagnostic: bool = False,
+    filters: SolveFilterOptions | None = None,
 ) -> None:
     """
     Solve for executions that require recalculation
@@ -328,7 +387,7 @@ def solve_required_executions(  # noqa: PLR0913
     diagnostic_count = {}
     provider_count = {}
 
-    for potential_execution in solver.solve():
+    for potential_execution in solver.solve(filters):
         # The diagnostic output is first written to the scratch directory
         definition = potential_execution.build_execution_definition(output_root=config.paths.scratch)
 
@@ -371,6 +430,7 @@ def solve_required_executions(  # noqa: PLR0913
                 logger.info(f"Created new execution group: {potential_execution.execution_slug()!r}")
                 db.session.flush()
 
+            # TODO: Move this logic to the solver
             # Check if we should run given the one_per_provider or one_per_diagnostic flags
             one_of_check_failed = (
                 one_per_provider and provider_count.get(diagnostic.provider.slug, 0) > 0
@@ -403,10 +463,11 @@ def solve_required_executions(  # noqa: PLR0913
                 # Add links to the datasets used in the execution
                 execution.register_datasets(db, definition.datasets)
 
-                executor.run(
-                    definition=definition,
-                    execution=execution,
-                )
+                if execute:
+                    executor.run(
+                        definition=definition,
+                        execution=execution,
+                    )
 
                 provider_count[diagnostic.provider.slug] += 1
                 diagnostic_count[diagnostic.full_slug()] += 1
diff --git a/packages/climate-ref/tests/unit/cli/test_solve.py b/packages/climate-ref/tests/unit/cli/test_solve.py
@@ -14,6 +14,9 @@ def test_solve(self, sample_data_dir, db, invoke_cli, mocker):
 
         assert kwargs["timeout"] == 60
         assert not kwargs["dry_run"]
+        assert kwargs["execute"]
+        assert kwargs["filters"].diagnostic is None
+        assert kwargs["filters"].provider is None
 
     def test_solve_with_timeout(self, sample_data_dir, db, invoke_cli, mocker):
         mock_solve = mocker.patch("climate_ref.cli.solve.solve_required_executions")
@@ -28,3 +31,21 @@ def test_solve_with_dryrun(self, sample_data_dir, db, invoke_cli, mocker):
 
         args, kwargs = mock_solve.call_args
         assert kwargs["dry_run"]
+
+    def test_solve_with_filters(self, sample_data_dir, db, invoke_cli, mocker):
+        mock_solve = mocker.patch("climate_ref.cli.solve.solve_required_executions")
+        invoke_cli(
+            [
+                "solve",
+                "--diagnostic",
+                "global-mean-timeseries",
+                "--provider",
+                "esmvaltool",
+                "--provider",
+                "ilamb",
+            ]
+        )
+
+        args, kwargs = mock_solve.call_args
+        assert kwargs["filters"].diagnostic == ["global-mean-timeseries"]
+        assert kwargs["filters"].provider == ["esmvaltool", "ilamb"]
diff --git a/packages/climate-ref/tests/unit/test_solver.py b/packages/climate-ref/tests/unit/test_solver.py
@@ -12,6 +12,7 @@
 from climate_ref.solver import (
     DiagnosticExecution,
     ExecutionSolver,
+    SolveFilterOptions,
     extract_covered_datasets,
     solve_executions,
     solve_required_executions,
@@ -31,6 +32,19 @@ def solver(db_seeded, config) -> ExecutionSolver:
     return metric_solver
 
 
+@pytest.fixture
+def aft_solver(db_seeded, config) -> ExecutionSolver:
+    from climate_ref_esmvaltool import provider as esmvaltool_provider
+    from climate_ref_ilamb import provider as ilamb_provider
+    from climate_ref_pmp import provider as pmp_provider
+
+    registry = ProviderRegistry(providers=[pmp_provider, esmvaltool_provider, ilamb_provider])
+    metric_solver = ExecutionSolver.build_from_db(config, db_seeded)
+    metric_solver.provider_registry = registry
+
+    return metric_solver
+
+
 @pytest.fixture
 def mock_metric_execution(
     tmp_path, db_seeded, definition_factory, mock_diagnostic, provider
@@ -289,6 +303,69 @@ def test_extract_no_groups():
         extract_covered_datasets(data_catalog, requirement)
 
 
+def test_solver_solve_with_filters(aft_solver):
+    def solve_filtered(**kwargs):
+        """Solve with ``SolveFilterOptions(**kwargs)`` and return one DataFrame row per execution."""
+        return pd.DataFrame(
+            [
+                {
+                    "diagnostic": execution.diagnostic.slug,
+                    "provider": execution.provider.slug,
+                    "dataset_key": execution.dataset_key,
+                }
+                for execution in aft_solver.solve(filters=SolveFilterOptions(**kwargs))
+            ]
+        )
+
+    # Default, None and empty-list filters must not filter everything out (results are non-empty)
+    executions = solve_filtered()
+    assert not executions.empty
+    executions = solve_filtered(provider=None, diagnostic=None)
+    assert not executions.empty
+    executions = solve_filtered(provider=[], diagnostic=[])
+    assert not executions.empty
+
+    # ILAMB filter should only return ILAMB executions, covering more than one diagnostic
+    executions = solve_filtered(provider=["ilamb"])
+    assert executions["provider"].unique().tolist() == ["ilamb"]
+    assert executions["diagnostic"].nunique() > 1
+
+    # Multiple provider filters are ORed: executions from either provider are returned
+    executions = solve_filtered(provider=["ilamb", "pmp"])
+    assert sorted(executions["provider"].unique().tolist()) == ["ilamb", "pmp"]
+
+    # Partial diagnostic filter should return executions for that diagnostic
+    # enso metrics exist in both pmp and esmvaltool providers
+    executions = solve_filtered(diagnostic=["enso"])
+    assert sorted(executions["provider"].unique().tolist()) == ["esmvaltool", "pmp"]
+
+    # Adding in a provider filter as well should limit the results to that provider
+    executions = solve_filtered(provider=["pmp"], diagnostic=["enso"])
+    assert executions["provider"].unique().tolist() == ["pmp"]
+    assert sorted(executions["diagnostic"].unique().tolist()) == ["enso_proc", "enso_tel"]
+
+    # Filter values are matched case-insensitively, so mixed-case input gives identical results
+    pd.testing.assert_frame_equal(executions, solve_filtered(provider=["PmP"], diagnostic=["enSo"]))
+
+    # Missing provider should return no results
+    assert not list(
+        aft_solver.solve(
+            filters=SolveFilterOptions(
+                provider=["missing"],
+            )
+        )
+    )
+
+    # Missing diagnostic should return no results
+    assert not list(
+        aft_solver.solve(
+            filters=SolveFilterOptions(
+                diagnostic=["missing"],
+            )
+        )
+    )
+
+
 def test_solve_metrics_default_solver(mocker, mock_metric_execution, mock_executor, db_seeded, solver):
     mock_build_solver = mocker.patch.object(ExecutionSolver, "build_from_db")