[ENH/CLN] Refactor probabilistic workflow and add validations

Leguark · Leguark · commit 6e8a113b94a0 · 2025-06-12T08:26:22.000Z
Refactored the probabilistic test setup by extracting the prior-sampling and inference steps into a reusable `_prob_run` helper for clarity and reusability. Added posterior mean value checks and an MCMC divergence check, and removed redundant comments to improve readability and test robustness.
diff --git a/gempy_probability/modules/model_definition/model_examples.py b/gempy_probability/modules/model_definition/model_examples.py
@@ -26,12 +26,16 @@ def model(geo_model: gempy.core.data.GeoModel, normal, y_obs_list):
 
     # * Update the model with the new top layer's location
     interpolation_input = interpolation_input_from_structural_frame(geo_model)
-    interpolation_input.surface_points.sp_coords = torch.index_put(
-        input=interpolation_input.surface_points.sp_coords,
-        indices=(torch.tensor([0]), torch.tensor([2])),
-        values=mu_top
-    )
-    # interpolation_input.surface_points.sp_coords[0, 2] = mu_top
+    
+    if False:  # TODO: Decide whether the out-of-place torch.index_put is needed instead of the in-place assignment below
+        indices__ = (torch.tensor([0]), torch.tensor([2]))  # * This has to be Tensors
+        interpolation_input.surface_points.sp_coords = torch.index_put(
+            input=interpolation_input.surface_points.sp_coords,
+            indices=indices__,
+            values=mu_top
+        )
+    else:
+        interpolation_input.surface_points.sp_coords[0, 2] = mu_top
 
     # endregion
 
@@ -50,13 +54,14 @@ def model(geo_model: gempy.core.data.GeoModel, normal, y_obs_list):
     thickness = simulated_well.sum()
     pyro.deterministic(
         name=r'$\mu_{thickness}$',
-        value=thickness.detach()
+        value=thickness.detach()  # * Recorded only so that ArviZ (az) can track progress
     )
 
     # endregion
 
+    posterior_dist_normal = dist.Normal(thickness, 25)
     y_thickness = pyro.sample(
         name=r'$y_{thickness}$',
-        fn=dist.Normal(thickness, 25),
+        fn=posterior_dist_normal,
         obs=y_obs_list
     )
diff --git a/tests/test_prob_model/test_prob_I.py b/tests/test_prob_model/test_prob_I.py
@@ -1,7 +1,11 @@
+import numpy as np
 import os
+import pyro.distributions as dist
+import torch
+from pyro.distributions import TorchDistributionMixin
+
 import gempy as gp
-import gempy_engine
-import numpy as np
+from gempy_engine.core.backend_tensor import BackendTensor
 
 
 def test_basic_gempy_I() -> None:
@@ -20,68 +24,72 @@ def test_basic_gempy_I() -> None:
     # TODO: Convert this into an options preset
     geo_model.interpolation_options.uni_degree = 0
     geo_model.interpolation_options.mesh_extraction = False
-    geo_model.interpolation_options.sigmoid_slope = 1100.
+    geo_model.interpolation_options.sigmoid_slope = 1100. 
 
+    # region Minimal grid for the specific likelihood function
     x_loc = 6000
     y_loc = 0
     z_loc = np.linspace(0, 4000, 100)
     xyz_coord = np.array([[x_loc, y_loc, z] for z in z_loc])
     gp.set_custom_grid(geo_model.grid, xyz_coord=xyz_coord)
+    # endregion
 
     # TODO: Make sure only the custom grid ins active
-
+    
+    
     gp.compute_model(
         gempy_model=geo_model,
-        engine_config=gp.data.GemPyEngineConfig(backend=gp.data.AvailableBackends.numpy)
+        engine_config=gp.data.GemPyEngineConfig(
+            backend=gp.data.AvailableBackends.numpy
+        )
     )
-
-    from gempy_engine.core.backend_tensor import BackendTensor
-    BackendTensor.change_backend_gempy(engine_backend=gp.data.AvailableBackends.PYTORCH)
     
-    import pyro.distributions as dist
-    import torch
+    # TODO: Find a more elegant way to switch the backend
+    BackendTensor.change_backend_gempy(engine_backend=gp.data.AvailableBackends.PYTORCH)
 
     normal = dist.Normal(
         loc=(geo_model.surface_points_copy_transformed.xyz[0, 2]),
         scale=torch.tensor(0.1, dtype=torch.float64)
     )
-    # %%
-    # Running Prior Sampling and Visualization
-    # ----------------------------------------
-    # Prior sampling is an essential step in probabilistic modeling. 
-    # It helps in understanding the distribution of our prior assumptions before observing any data.
-
-    # %%
-    # Prepare observation data
-    import torch
-    y_obs_list = torch.tensor([200, 210, 190])
-
-    # %%
+
+    from gempy_probability.modules.model_definition.model_examples import model
+    _prob_run(
+        geo_model=geo_model,
+        prob_model=model,
+        normal=normal,
+        y_obs_list=torch.tensor([200, 210, 190])
+    )
+
+
+def _prob_run(geo_model: gp.data.GeoModel, prob_model: callable,
+              normal: TorchDistributionMixin, y_obs_list: torch.Tensor) -> None:
     # Run prior sampling and visualization
     from pyro.infer import Predictive
     import pyro
     import arviz as az
     import matplotlib.pyplot as plt
 
-    from gempy_probability.modules.model_definition.model_examples import model
+    from pyro.infer import NUTS
+    from pyro.infer import MCMC
+    from pyro.infer.autoguide import init_to_mean
+
+    # region prior sampling
     predictive = Predictive(
-        model=model,
+        model=prob_model,
         num_samples=50
     )
-
     prior = predictive(geo_model, normal, y_obs_list)
 
     data = az.from_pyro(prior=prior)
     az.plot_trace(data.prior)
     plt.show()
 
-    from pyro.infer import NUTS
-    from pyro.infer import MCMC
-    from pyro.infer.autoguide import init_to_mean
+    # endregion
 
+    # region inference
     pyro.primitives.enable_validation(is_validate=True)
     nuts_kernel = NUTS(
-        model,
+        prob_model,
         step_size=0.0085,
         adapt_step_size=True,
         target_accept_prob=0.9,
@@ -95,20 +103,24 @@ def test_basic_gempy_I() -> None:
         disable_validation=False
     )
     mcmc.run(geo_model, normal, y_obs_list)
-
     posterior_samples = mcmc.get_samples()
-    
     posterior_predictive_fn = Predictive(
-        model=model,
+        model=prob_model,
         posterior_samples=posterior_samples
     )
-
     posterior_predictive = posterior_predictive_fn(geo_model, normal, y_obs_list)
-
     data = az.from_pyro(posterior=mcmc, prior=prior, posterior_predictive=posterior_predictive)
+    # Test posterior mean values
+    posterior_top_mean = float(data.posterior[r'$\mu_{top}$'].mean())
+    assert 0.0070 < posterior_top_mean < 0.0071, f"Top layer mean {posterior_top_mean} outside expected range"
+    posterior_thickness_mean = float(data.posterior_predictive[r'$\mu_{thickness}$'].mean())
+    assert 220 < posterior_thickness_mean < 225, f"Thickness mean {posterior_thickness_mean} outside expected range"
+    # Test convergence diagnostics
+    assert float(data.sample_stats.diverging.sum()) == 0, "MCMC sampling has divergences"
+    # endregion
+
     az.plot_trace(data)
     plt.show()
-
     from gempy_probability.modules.plot.plot_posterior import default_red, default_blue
     az.plot_density(
         data=[data.posterior_predictive, data.prior_predictive],
@@ -118,7 +130,6 @@ def test_basic_gempy_I() -> None:
         colors=[default_red, default_blue],
     )
     plt.show()
-
     az.plot_density(
         data=[data, data.prior],
         shade=.9,