Merge pull request #345 from NathanielF/iv_weak_instruments

drbenvincent · web-flow · commit ca8c4f2f2fb4 · 2024-06-18T20:04:54.000+01:00
Instrumental Variables - Justifying Instruments
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -22,6 +22,7 @@ repos:
         exclude_types: [svg]
       - id: check-yaml
       - id: check-added-large-files
+        exclude: &exclude_pattern 'iv_weak_instruments.ipynb'
         args: ["--maxkb=1500"]
   - repo: https://github.com/astral-sh/ruff-pre-commit
     rev: v0.4.9
diff --git a/causalpy/data/datasets.py b/causalpy/data/datasets.py
@@ -35,6 +35,7 @@
     "geolift1": {"filename": "geolift1.csv"},
     "risk": {"filename": "AJR2001.csv"},
     "nhefs": {"filename": "nhefs.csv"},
+    "schoolReturns": {"filename": "schoolingReturns.csv"},
 }
 
 
diff --git a/causalpy/data/schoolingReturns.csv b/causalpy/data/schoolingReturns.csv
diff --git a/causalpy/pymc_experiments.py b/causalpy/pymc_experiments.py
@@ -1453,7 +1453,7 @@ def __init__(
                 "mus": [self.ols_beta_first_params, self.ols_beta_second_params],
                 "sigmas": [1, 1],
                 "eta": 2,
-                "lkj_sd": 2,
+                "lkj_sd": 1,
             }
         self.priors = priors
         self.model.fit(
diff --git a/causalpy/pymc_models.py b/causalpy/pymc_models.py
@@ -303,8 +303,8 @@ class InstrumentalVariableRegression(ModelBuilder):
     ...                  "mus": [[-2,4], [0.5, 3]],
     ...                  "sigmas": [1, 1],
     ...                  "eta": 2,
-    ...                  "lkj_sd": 2,
-    ...              })
+    ...                  "lkj_sd": 1,
+    ...              }, None)
     Inference data...
     """
 
@@ -340,7 +340,7 @@ def build_model(self, X, Z, y, t, coords, priors):
                 sigma=priors["sigmas"][1],
                 dims="covariates",
             )
-            sd_dist = pm.HalfCauchy.dist(beta=priors["lkj_sd"], shape=2)
+            sd_dist = pm.Exponential.dist(priors["lkj_sd"], shape=2)
             chol, corr, sigmas = pm.LKJCholeskyCov(
                 name="chol_cov",
                 eta=priors["eta"],
@@ -366,24 +366,52 @@ def build_model(self, X, Z, y, t, coords, priors):
                 shape=(X.shape[0], 2),
             )
 
-    def fit(self, X, Z, y, t, coords, priors):
-        """Draw samples from posterior, prior predictive, and posterior predictive
-        distributions.
+    def sample_predictive_distribution(self, ppc_sampler="jax"):
+        """Function to sample the Multivariate Normal posterior predictive
+        Likelihood term in the IV class. This can be slow without
+        using the JAX sampler compilation method. If using the
+        JAX sampler it will sample only the posterior predictive distribution.
+        If using the PYMC sampler it will sample both the prior
+        and posterior predictive distributions."""
+        random_seed = self.sample_kwargs.get("random_seed", None)
+
+        if ppc_sampler == "jax":
+            with self:
+                self.idata.extend(
+                    pm.sample_posterior_predictive(
+                        self.idata,
+                        random_seed=random_seed,
+                        compile_kwargs={"mode": "JAX"},
+                    )
+                )
+        elif ppc_sampler == "pymc":
+            with self:
+                self.idata.extend(pm.sample_prior_predictive(random_seed=random_seed))
+                self.idata.extend(
+                    pm.sample_posterior_predictive(
+                        self.idata,
+                        random_seed=random_seed,
+                    )
+                )
+
+    def fit(self, X, Z, y, t, coords, priors, ppc_sampler=None):
+        """Draw samples from posterior distribution and potentially
+        from the prior and posterior predictive distributions. The
+        fit call can take values for the
+        ppc_sampler = ['jax', 'pymc', None]
+        We default to None, so the user can determine if they wish
+        to spend time sampling the posterior predictive distribution
+        independently.
         """
 
         # Ensure random_seed is used in sample_prior_predictive() and
         # sample_posterior_predictive() if provided in sample_kwargs.
-        random_seed = self.sample_kwargs.get("random_seed", None)
+        # Predictive sampling is delegated to sample_predictive_distribution()
 
         self.build_model(X, Z, y, t, coords, priors)
         with self:
             self.idata = pm.sample(**self.sample_kwargs)
-            self.idata.extend(pm.sample_prior_predictive(random_seed=random_seed))
-            self.idata.extend(
-                pm.sample_posterior_predictive(
-                    self.idata, progressbar=False, random_seed=random_seed
-                )
-            )
+        self.sample_predictive_distribution(ppc_sampler=ppc_sampler)
         return self.idata
 
 
diff --git a/causalpy/tests/test_integration_pymc_examples.py b/causalpy/tests/test_integration_pymc_examples.py
@@ -504,6 +504,7 @@ def test_iv_reg():
             sample_kwargs=sample_kwargs
         ),
     )
+    result.model.sample_predictive_distribution(ppc_sampler="pymc")
     assert isinstance(df, pd.DataFrame)
     assert isinstance(data, pd.DataFrame)
     assert isinstance(instruments_data, pd.DataFrame)
diff --git a/docs/source/_static/interrogate_badge.svg b/docs/source/_static/interrogate_badge.svg
@@ -1,5 +1,5 @@
 <svg width="140" height="20" viewBox="0 0 140 20" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xml:space="preserve" xmlns:serif="http://www.serif.com/" style="fill-rule:evenodd;clip-rule:evenodd;stroke-linejoin:round;stroke-miterlimit:2;">
-    <title>interrogate: 94.2%</title>
+    <title>interrogate: 94.3%</title>
     <g transform="matrix(1,0,0,1,22,0)">
         <g id="backgrounds" transform="matrix(1.32789,0,0,1,-22.3892,0)">
             <rect x="0" y="0" width="71" height="20" style="fill:rgb(85,85,85);"/>
@@ -12,8 +12,8 @@
     <g fill="#fff" text-anchor="middle" font-family="DejaVu Sans,Verdana,Geneva,sans-serif" font-size="110">
         <text x="590" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="610">interrogate</text>
         <text x="590" y="140" transform="scale(.1)" textLength="610">interrogate</text>
-        <text x="1160" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="370" data-interrogate="result">94.2%</text>
-        <text x="1160" y="140" transform="scale(.1)" textLength="370" data-interrogate="result">94.2%</text>
+        <text x="1160" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="370" data-interrogate="result">94.3%</text>
+        <text x="1160" y="140" transform="scale(.1)" textLength="370" data-interrogate="result">94.3%</text>
     </g>
     <g id="logo-shadow" serif:id="logo shadow" transform="matrix(0.854876,0,0,0.854876,-6.73514,1.732)">
         <g transform="matrix(0.299012,0,0,0.299012,9.70229,-6.68582)">
diff --git a/docs/source/examples.rst b/docs/source/examples.rst
@@ -68,6 +68,7 @@ Instrumental Variables Regression
    :titlesonly:
 
    notebooks/iv_pymc.ipynb
+   notebooks/iv_weak_instruments.ipynb
 
 Inverse Propensity Score Weighting
 =================================
diff --git a/docs/source/glossary.rst b/docs/source/glossary.rst
@@ -46,6 +46,10 @@ Glossary
    Endogenous Variable
       An endogenous variable is a variable in a regression equation such that the variable is correlated with the error term of the equation i.e. correlated with the outcome variable (in the system). This is a problem for OLS regression estimation techniques because endogeneity violates the assumptions of the Gauss Markov theorem.
 
+   Local Average Treatment effect
+   LATE
+      Also known as the complier average causal effect (CACE), this is the effect of a treatment for subjects who comply with the experimental treatment assigned to their sample group. It is the quantity we're estimating in IV designs.
+
    Non-equivalent group designs
    NEGD
       A quasi-experimental design where units are assigned to conditions non-randomly, and not according to a running variable (see Regression discontinuity design). This can be problematic when assigning causal influence of the treatment - differences in outcomes between groups could be due to the treatment or due to differences in the group attributes themselves.
@@ -62,6 +66,9 @@ Glossary
    Pretest-posttest design
       A quasi-experimental design where the treatment effect is estimated by comparing an outcome measure before and after treatment.
 
+   Propensity scores
+      An estimate of the probability of adopting a treatment status. Used in re-weighting schemes to balance observational data.
+
    Quasi-experiment
       An empirical comparison used to estimate the effects of a treatment where units are not assigned to conditions at random.
 
@@ -101,8 +108,6 @@ Glossary
    2SLS
       An estimation technique for estimating the parameters of an IV regression. It takes its name from the fact that it uses two OLS regressions - a first and second stage.
 
-   Propensity scores
-      An estimate of the probability of adopting a treatment status. Used in re-weighting schemes to balance observational data.
 
 
 References
diff --git a/docs/source/notebooks/iv_weak_instruments.ipynb b/docs/source/notebooks/iv_weak_instruments.ipynb
diff --git a/docs/source/quasi_dags.ipynb b/docs/source/quasi_dags.ipynb
@@ -349,7 +349,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "One nice feature of this set up is that we can evaluate the claim of __strong ignorability__ because it implies that  $T  \\perp\\!\\!\\!\\perp  X | PS(X)$ and this ensures the covariate profiles are balanced across the treatment branches conditional on the propensity score. This is a testable implication of the postulated design! Balance plots and measures are ways in which to evaluate if the offset achieved by your propensity score has worked. It is crucial that PS serve as a balancing score, if the measure cannot serve as a balancing score the collision effect can add to the confounding bias rather than remove it. "
+    "One nice feature of this set up is that we can evaluate the claim of __strong ignorability__ because it implies that  $Z  \\perp\\!\\!\\!\\perp  X | PS(X)$ and this ensures the covariate profiles are balanced across the treatment branches conditional on the propensity score. This is a testable implication of the postulated design! Balance plots and measures are ways in which to evaluate if the offset achieved by your propensity score has worked. It is crucial that PS serve as a balancing score, if the measure cannot serve as a balancing score the collision effect can add to the confounding bias rather than remove it. "
    ]
   },
   {
diff --git a/docs/source/references.bib b/docs/source/references.bib
@@ -76,6 +76,15 @@ @article{acemoglu2001colonial
   year={2001}
 }
 
+@incollection{card1995returns,
+  author={Card, David},
+  title={Using Geographical Variation in College Proximity to Estimate the Return to Schooling},
+  editor={Christofides, L.N. and Grant, E.K. and Swidinsky, R.},
+  booktitle={Aspects of Labour Market Behaviour: Essays in Honour of John Vanderkamp},
+  year={1995},
+  publisher={University of Toronto Press}
+}
+
 @incollection{forde2024nonparam,
   author    = {Forde, Nathaniel},
   title     = {Bayesian Non-parametric Causal Inference},
diff --git a/pyproject.toml b/pyproject.toml
@@ -39,7 +39,7 @@ dependencies = [
     "scipy",
     "seaborn>=0.11.2",
     "statsmodels",
-    "xarray>=v2022.11.0",
+    "xarray>=v2022.11.0"
 ]
 
 # List additional groups of dependencies here (e.g. development dependencies). Users

Original file line number	Diff line number	Diff line change
`@@ -35,6 +35,7 @@`
`35`	`35`	`"geolift1": {"filename": "geolift1.csv"},`
`36`	`36`	`"risk": {"filename": "AJR2001.csv"},`
`37`	`37`	`"nhefs": {"filename": "nhefs.csv"},`
	`38`	`+ "schoolReturns": {"filename": "schoolingReturns.csv"},`
`38`	`39`	`}`
`39`	`40`
`40`	`41`
Original file line number	Diff line number	Diff line change
`@@ -1453,7 +1453,7 @@ def __init__(`
`1453`	`1453`	`"mus": [self.ols_beta_first_params, self.ols_beta_second_params],`
`1454`	`1454`	`"sigmas": [1, 1],`
`1455`	`1455`	`"eta": 2,`
`1456`		`- "lkj_sd": 2,`
	`1456`	`+ "lkj_sd": 1,`
`1457`	`1457`	`}`
`1458`	`1458`	`self.priors = priors`
`1459`	`1459`	`self.model.fit(`
Original file line number	Diff line number	Diff line change
`@@ -504,6 +504,7 @@ def test_iv_reg():`
`504`	`504`	`sample_kwargs=sample_kwargs`
`505`	`505`	`),`
`506`	`506`	`)`
	`507`	`+ result.model.sample_predictive_distribution(ppc_sampler="pymc")`
`507`	`508`	`assert isinstance(df, pd.DataFrame)`
`508`	`509`	`assert isinstance(data, pd.DataFrame)`
`509`	`510`	`assert isinstance(instruments_data, pd.DataFrame)`