Skip to content

Commit ca8c4f2

Browse files
authored
Merge pull request #345 from NathanielF/iv_weak_instruments
Instrumental Variables - Justifying Instruments
2 parents 66096d6 + 9eb41f1 commit ca8c4f2

13 files changed

+6634
-21
lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ repos:
2222
exclude_types: [svg]
2323
- id: check-yaml
2424
- id: check-added-large-files
25+
exclude: &exclude_pattern 'iv_weak_instruments.ipynb'
2526
args: ["--maxkb=1500"]
2627
- repo: https://github.com/astral-sh/ruff-pre-commit
2728
rev: v0.4.9

causalpy/data/datasets.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
"geolift1": {"filename": "geolift1.csv"},
3636
"risk": {"filename": "AJR2001.csv"},
3737
"nhefs": {"filename": "nhefs.csv"},
38+
"schoolReturns": {"filename": "schoolingReturns.csv"},
3839
}
3940

4041

causalpy/data/schoolingReturns.csv

Lines changed: 3011 additions & 0 deletions
Large diffs are not rendered by default.

causalpy/pymc_experiments.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1453,7 +1453,7 @@ def __init__(
14531453
"mus": [self.ols_beta_first_params, self.ols_beta_second_params],
14541454
"sigmas": [1, 1],
14551455
"eta": 2,
1456-
"lkj_sd": 2,
1456+
"lkj_sd": 1,
14571457
}
14581458
self.priors = priors
14591459
self.model.fit(

causalpy/pymc_models.py

Lines changed: 41 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -303,8 +303,8 @@ class InstrumentalVariableRegression(ModelBuilder):
303303
... "mus": [[-2,4], [0.5, 3]],
304304
... "sigmas": [1, 1],
305305
... "eta": 2,
306-
... "lkj_sd": 2,
307-
... })
306+
... "lkj_sd": 1,
307+
... }, None)
308308
Inference data...
309309
"""
310310

@@ -340,7 +340,7 @@ def build_model(self, X, Z, y, t, coords, priors):
340340
sigma=priors["sigmas"][1],
341341
dims="covariates",
342342
)
343-
sd_dist = pm.HalfCauchy.dist(beta=priors["lkj_sd"], shape=2)
343+
sd_dist = pm.Exponential.dist(priors["lkj_sd"], shape=2)
344344
chol, corr, sigmas = pm.LKJCholeskyCov(
345345
name="chol_cov",
346346
eta=priors["eta"],
@@ -366,24 +366,52 @@ def build_model(self, X, Z, y, t, coords, priors):
366366
shape=(X.shape[0], 2),
367367
)
368368

369-
def fit(self, X, Z, y, t, coords, priors):
370-
"""Draw samples from posterior, prior predictive, and posterior predictive
371-
distributions.
369+
def sample_predictive_distribution(self, ppc_sampler="jax"):
370+
"""Function to sample the Multivariate Normal posterior predictive
371+
Likelihood term in the IV class. This can be slow without
372+
using the JAX sampler compilation method. If using the
373+
JAX sampler it will sample only the posterior predictive distribution.
374+
If using the PYMC sampler it will sample both the prior
375+
and posterior predictive distributions."""
376+
random_seed = self.sample_kwargs.get("random_seed", None)
377+
378+
if ppc_sampler == "jax":
379+
with self:
380+
self.idata.extend(
381+
pm.sample_posterior_predictive(
382+
self.idata,
383+
random_seed=random_seed,
384+
compile_kwargs={"mode": "JAX"},
385+
)
386+
)
387+
elif ppc_sampler == "pymc":
388+
with self:
389+
self.idata.extend(pm.sample_prior_predictive(random_seed=random_seed))
390+
self.idata.extend(
391+
pm.sample_posterior_predictive(
392+
self.idata,
393+
random_seed=random_seed,
394+
)
395+
)
396+
397+
def fit(self, X, Z, y, t, coords, priors, ppc_sampler=None):
398+
"""Draw samples from posterior distribution and potentially
399+
from the prior and posterior predictive distributions. The
400+
fit call can take values for the
401+
ppc_sampler = ['jax', 'pymc', None]
402+
We default to None, so the user can determine if they wish
403+
to spend time sampling the posterior predictive distribution
404+
independently.
372405
"""
373406

374407
# Ensure random_seed is used in sample_prior_predictive() and
375408
# sample_posterior_predictive() if provided in sample_kwargs.
376-
random_seed = self.sample_kwargs.get("random_seed", None)
409+
# Use JAX for ppc sampling of multivariate likelihood
377410

378411
self.build_model(X, Z, y, t, coords, priors)
379412
with self:
380413
self.idata = pm.sample(**self.sample_kwargs)
381-
self.idata.extend(pm.sample_prior_predictive(random_seed=random_seed))
382-
self.idata.extend(
383-
pm.sample_posterior_predictive(
384-
self.idata, progressbar=False, random_seed=random_seed
385-
)
386-
)
414+
self.sample_predictive_distribution(ppc_sampler=ppc_sampler)
387415
return self.idata
388416

389417

causalpy/tests/test_integration_pymc_examples.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -504,6 +504,7 @@ def test_iv_reg():
504504
sample_kwargs=sample_kwargs
505505
),
506506
)
507+
result.model.sample_predictive_distribution(ppc_sampler="pymc")
507508
assert isinstance(df, pd.DataFrame)
508509
assert isinstance(data, pd.DataFrame)
509510
assert isinstance(instruments_data, pd.DataFrame)

docs/source/_static/interrogate_badge.svg

Lines changed: 3 additions & 3 deletions
Loading

docs/source/examples.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ Instrumental Variables Regression
6868
:titlesonly:
6969

7070
notebooks/iv_pymc.ipynb
71+
notebooks/iv_weak_instruments.ipynb
7172

7273
Inverse Propensity Score Weighting
7374
=================================

docs/source/glossary.rst

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,10 @@ Glossary
4646
Endogenous Variable
4747
An endogenous variable is a variable in a regression equation such that the variable is correlated with the error term of the equation i.e. correlated with the outcome variable (in the system). This is a problem for OLS regression estimation techniques because endogeneity violates the assumptions of the Gauss-Markov theorem.
4848

49+
Local Average Treatment effect
50+
LATE
51+
Also known as the complier average causal effect (CACE), this is the effect of a treatment for subjects who comply with the experimental treatment assigned to their sample group. It is the quantity we're estimating in IV designs.
52+
4953
Non-equivalent group designs
5054
NEGD
5155
A quasi-experimental design where units are assigned to conditions non-randomly, and not according to a running variable (see Regression discontinuity design). This can be problematic when assigning causal influence of the treatment - differences in outcomes between groups could be due to the treatment or due to differences in the group attributes themselves.
@@ -62,6 +66,9 @@ Glossary
6266
Pretest-posttest design
6367
A quasi-experimental design where the treatment effect is estimated by comparing an outcome measure before and after treatment.
6468

69+
Propensity scores
70+
An estimate of the probability of adopting a treatment status. Used in re-weighting schemes to balance observational data.
71+
6572
Quasi-experiment
6673
An empirical comparison used to estimate the effects of a treatment where units are not assigned to conditions at random.
6774

@@ -101,8 +108,6 @@ Glossary
101108
2SLS
102109
An estimation technique for estimating the parameters of an IV regression. It takes its name from the fact that it uses two OLS regressions - a first and second stage.
103110

104-
Propensity scores
105-
An estimate of the probability of adopting a treatment status. Used in re-weighting schemes to balance observational data.
106111

107112

108113
References

docs/source/notebooks/iv_weak_instruments.ipynb

Lines changed: 3556 additions & 0 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)