Commit 52ce543

Merge pull request #229 from stan-dev/case-study/sum-to-zero
sum_to_zero_vector case study
2 parents 0743110 + 987aeb3

24 files changed: +5054 -0 lines

jupyter/radon/LICENSE

Lines changed: 354 additions & 0 deletions
Large diffs are not rendered by default.

jupyter/radon/README.md

Lines changed: 6 additions & 0 deletions
Notebook based on the radon case study from Gelman and Hill (2007).
Introduces hierarchical models and the Python packages `cmdstanpy` and `plotnine`.

Author: Mitzi Morris

jupyter/sum-to-zero/LICENSE

Lines changed: 354 additions & 0 deletions
Large diffs are not rendered by default.

jupyter/sum-to-zero/README.md

Lines changed: 18 additions & 0 deletions
Case study and Jupyter notebook demonstrating the correctness and efficiency of the
`sum_to_zero_vector` constrained parameter type, introduced in Stan 2.36.

- Case study `sum_to_zero_evaluation.qmd` demonstrates use of `sum_to_zero_vector` for two models.

- Jupyter notebook `sum_to_zero_evaluation.ipynb` gives a step-by-step explanation of the operations
used to carry out this evaluation.

The GitHub repository for this notebook also includes several Python files of helper functions:

* eval_efficiencies.py - runs the models repeatedly and reports average performance statistics.
* utils.py - simulates data for the binomial model.
* utils_html.py - formats Stan summaries for this notebook.
* utils_bym2.py - computes data inputs to the BYM2 model.
* utils_nyc_map.py - munges the New York City census tract map.

Author: Mitzi Morris
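
For readers unfamiliar with the type, here is a minimal sketch of its declaration, assuming Stan 2.36 or later (the variable names are illustrative, not taken from the case study):

```stan
data {
  int<lower=1> K;
}
parameters {
  // K values constrained to sum to zero;
  // only K - 1 parameters are free on the unconstrained scale
  sum_to_zero_vector[K] theta;
}
model {
  theta ~ normal(0, 1);
}
```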
jupyter/sum-to-zero/binomial_runtimes_large.json

Lines changed: 1 addition & 0 deletions

{"runtime, ave":{"ozs large":2.41,"hard large":3.25,"soft large":17.41},"runtime, std dev":{"ozs large":0.08,"hard large":0.09,"soft large":0.5},"ESS_bulk\/s":{"ozs large":1305.86,"hard large":981.53,"soft large":181.15}}
jupyter/sum-to-zero/binomial_runtimes_small.json

Lines changed: 1 addition & 0 deletions

{"runtime, ave":{"ozs small":2.21,"hard small":3.07,"soft small":54.49},"runtime, std dev":{"ozs small":0.05,"hard small":0.08,"soft small":2.54},"ESS_bulk\/s":{"ozs small":1327.11,"hard small":988.47,"soft small":63.14}}

jupyter/sum-to-zero/data/nyc_study.geojson

Lines changed: 2102 additions & 0 deletions
Large diffs are not rendered by default.
jupyter/sum-to-zero/eval_efficiencies.py

Lines changed: 93 additions & 0 deletions
```python
# Compare ESS/sec for the binomial model across sum-to-zero implementations,
# using simulated datasets with smaller and larger numbers of observations.
import logging
import os
import warnings
from typing import List, Tuple

import numpy as np
import pandas as pd
from cmdstanpy import CmdStanModel, set_cmdstan_path

cmdstanpy_logger = logging.getLogger("cmdstanpy")
cmdstanpy_logger.setLevel(logging.FATAL)
warnings.filterwarnings('ignore')

set_cmdstan_path(os.path.join('/Users', 'mitzi', 'github', 'stan-dev', 'cmdstan'))

from utils import simulate_data


# Fit the model to the dataset N times.
# For each run, save total wall-clock time and effective samples per second.
# Returns an np.ndarray of shape (N, 2) with the timing information.
def time_fits(N: int, model: CmdStanModel, data: dict) -> np.ndarray:
    fit_times = np.empty(shape=(N, 2), dtype=float)
    for i in range(N):
        print('Run', i)
        fit = model.sample(data=data, parallel_chains=4,
                           show_progress=False, show_console=False, refresh=10_000)
        fit_summary = fit.summary()
        total_time = 0
        times = fit.time
        for j in range(len(times)):
            total_time += times[j]['total']

        fit_times[i, 0] = total_time
        fit_times[i, 1] = fit_summary.loc['lp__', 'ESS_bulk/s']
    return fit_times


# Given a list of (label, times) pairs, build a DataFrame of the mean and
# std dev of wall-clock time, plus the mean ESS_bulk/s.
def summarize_times(data_pairs: List[Tuple[str, np.ndarray]]) -> pd.DataFrame:
    result_data = []
    for label, array in data_pairs:
        result_data.append({
            'label': label,
            'mean': np.mean(array, axis=0)[0],
            'std dev': np.std(array, axis=0)[0],
            'ESS_bulk/s': np.mean(array, axis=0)[1]
        })
    df = pd.DataFrame(result_data)
    return df.set_index('label').round(2)


# Create datasets - fixed sizes and seed.
N_eth = 3
N_edu = 5
N_age = 9
baseline = -3.5
sens = 0.75
spec = 0.9995
data_small = simulate_data(N_eth, N_edu, N_age, baseline, sens, spec, 17, seed=45678)
data_large = simulate_data(N_eth, N_edu, N_age, baseline, sens, spec, 200, seed=45678)

# sum_to_zero_vector
binomial_ozs_mod = CmdStanModel(stan_file=os.path.join('stan', 'binomial_4preds_ozs.stan'))
times_ozs_large = time_fits(100, binomial_ozs_mod, data_large)
times_ozs_small = time_fits(100, binomial_ozs_mod, data_small)

# hard sum-to-zero constraint
binomial_hard_mod = CmdStanModel(stan_file=os.path.join('stan', 'binomial_4preds_hard.stan'))
times_hard_small = time_fits(100, binomial_hard_mod, data_small)
times_hard_large = time_fits(100, binomial_hard_mod, data_large)

# soft sum-to-zero constraint
binomial_soft_mod = CmdStanModel(stan_file=os.path.join('stan', 'binomial_4preds_soft.stan'))
times_soft_small = time_fits(100, binomial_soft_mod, data_small)
times_soft_large = time_fits(100, binomial_soft_mod, data_large)


df_small = summarize_times([('ozs small', times_ozs_small),
                            ('hard small', times_hard_small),
                            ('soft small', times_soft_small)])
df_small.to_json("binomial_runtimes_small.json")

df_large = summarize_times([('ozs large', times_ozs_large),
                            ('hard large', times_hard_large),
                            ('soft large', times_soft_large)])
df_large.to_json("binomial_runtimes_large.json")
```
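
Of the three Stan files this script compiles, `binomial_4preds_soft.stan` does not appear in this diff excerpt. A sketch of the standard soft sum-to-zero idiom such a model typically uses, where a tight zero-centered prior on the sum replaces the structural constraint (a sketch of the technique, not the file's actual contents):

```stan
parameters {
  vector[N_age] beta_age;  // full-length vector, no structural constraint
}
model {
  beta_age ~ normal(0, sigma_age);
  // hypothetical: soft sum-to-zero constraint nudges the sum toward zero
  sum(beta_age) ~ normal(0, 0.001 * N_age);
}
```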
jupyter/sum-to-zero/stan/binomial_4preds_hard.stan

Lines changed: 53 additions & 0 deletions
```stan
// multi-level model for binomial data with 4 categorical predictors.
data {
  int<lower=1> N;  // number of strata
  int<lower=1> N_age;
  int<lower=1> N_eth;
  int<lower=1> N_edu;

  array[N] int<lower=0> pos_tests;
  array[N] int<lower=0> tests;
  array[N] int<lower=1, upper=2> sex;
  array[N] int<lower=1, upper=N_age> age;
  array[N] int<lower=1, upper=N_eth> eth;
  array[N] int<lower=1, upper=N_edu> edu;

  // hyperparameters
  real<lower=0, upper=1> sens;
  real<lower=0, upper=1> spec;
}
parameters {
  real beta_0;
  real beta_sex_raw;
  real<lower=0> sigma_age, sigma_eth, sigma_edu;
  vector[N_age - 1] beta_age_raw;
  vector[N_eth - 1] beta_eth_raw;
  vector[N_edu - 1] beta_edu_raw;
}
transformed parameters {
  vector[2] beta_sex = [beta_sex_raw, -beta_sex_raw]';

  // hard sum-to-zero constraint: the last element is the negative sum of the rest
  vector[N_age] beta_age = append_row(beta_age_raw, -sum(beta_age_raw));
  vector[N_eth] beta_eth = append_row(beta_eth_raw, -sum(beta_eth_raw));
  vector[N_edu] beta_edu = append_row(beta_edu_raw, -sum(beta_edu_raw));

  vector[N] eta = inv_logit(beta_0 + beta_sex[sex] + beta_age[age] + beta_eth[eth] + beta_edu[edu]);
  vector[N] prob_pos_test = eta * sens + (1 - eta) * (1 - spec);
}
model {
  pos_tests ~ binomial(tests, prob_pos_test);  // likelihood

  // priors
  beta_0 ~ normal(0, 2.5);
  beta_sex ~ std_normal();
  // centered parameterization
  beta_age_raw ~ normal(0, sigma_age);
  beta_eth_raw ~ normal(0, sigma_eth);
  beta_edu_raw ~ normal(0, sigma_edu);
  sigma_eth ~ std_normal();
  sigma_age ~ std_normal();
  sigma_edu ~ std_normal();
}
generated quantities {
  array[N] int<lower=0> y_rep = binomial_rng(tests, prob_pos_test);
}
```
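
The `binomial_4preds_ozs.stan` variant that the benchmark script references also does not appear in this excerpt. Judging from the type the case study evaluates, a plausible sketch of how it would differ from the hard-constraint model above, with the rest of the model unchanged:

```stan
parameters {
  // hypothetical: sum_to_zero_vector replaces each (K-1)-length raw vector
  // and the append_row reconstruction in transformed parameters
  sum_to_zero_vector[N_age] beta_age;
  sum_to_zero_vector[N_eth] beta_eth;
  sum_to_zero_vector[N_edu] beta_edu;
}
model {
  beta_age ~ normal(0, sigma_age);
  beta_eth ~ normal(0, sigma_eth);
  beta_edu ~ normal(0, sigma_edu);
}
```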
Lines changed: 38 additions & 0 deletions
```stan
// multi-level model for binomial data with 4 categorical predictors.
data {
  int<lower=1> N;  // number of strata
  int<lower=1> N_age;
  int<lower=1> N_eth;
  int<lower=1> N_edu;

  // hyperparameters
  real<lower=0, upper=1> sens;
  real<lower=0, upper=1> spec;
}
parameters {
  real beta_0;
  real beta_sex_raw;
  real<lower=0> sigma_age, sigma_eth, sigma_edu;
  vector[N_age - 1] beta_age_raw;
  vector[N_eth - 1] beta_eth_raw;
  vector[N_edu - 1] beta_edu_raw;
}
transformed parameters {
  vector[2] beta_sex = [beta_sex_raw, -beta_sex_raw]';

  // hard sum-to-zero constraint
  vector[N_age] beta_age = append_row(beta_age_raw, -sum(beta_age_raw));
  vector[N_eth] beta_eth = append_row(beta_eth_raw, -sum(beta_eth_raw));
  vector[N_edu] beta_edu = append_row(beta_edu_raw, -sum(beta_edu_raw));
}
model {
  // priors only: with no likelihood, sampling yields the prior distribution
  beta_0 ~ normal(0, 2.5);
  beta_sex ~ std_normal();
  // centered parameterization
  beta_age_raw ~ normal(0, sigma_age);
  beta_eth_raw ~ normal(0, sigma_eth);
  beta_edu_raw ~ normal(0, sigma_edu);
  sigma_eth ~ std_normal();
  sigma_age ~ std_normal();
  sigma_edu ~ std_normal();
}
```
