8 Week 8
2021-09-08 [updated: 2022-01-26]
8.1 Question 1
8.1.1 Data
DT <- data_reedfrogs()
precis(DT)
## mean sd 5.5% 94.5% histogram
## density 23.33 10.38 10.00 35 ▇▁▁▁▁▁▁▇▁▁▁▁▇
## pred NaN NA NA NA
## size NaN NA NA NA
## surv 16.31 9.88 4.58 33 ▂▇▂▁▃▁▃
## propsurv 0.72 0.27 0.29 1 ▁▁▃▁▁▂▁▅▇
writeLines(readLines(tar_read(stan_d_file_w08_model_frogs_1)))
## data {
## int N;
## int survival[N];
## int density[N];
## int tank[N];
## }
## parameters {
## real sigma;
## real alpha[N];
## real alpha_bar;
## }
## transformed parameters {
## vector[N] p;
##
## for (i in 1:N) {
## p[i] = inv_logit(alpha[i]);
## }
## }
## model {
## alpha_bar ~ normal(0, 0.25);
## alpha ~ normal(alpha_bar, sigma);
## sigma ~ exponential(1);
## for (i in 1:N) {
## survival[i] ~ binomial(density[i], p[i]);
## }
## }
model_frogs_1_draws <- tar_read(stan_d_mcmc_w08_model_frogs_1)$draws()
mcmc_areas(model_frogs_1_draws, regex_pars = 'alpha')

mcmc_areas(model_frogs_1_draws, regex_pars = 'p\\[')

writeLines(readLines(tar_read(stan_d_file_w08_model_frogs_2)))
## data {
## int N;
## int survival[N];
## int density[N];
## int tank[N];
## int predation[N];
## }
## parameters {
## real sigma;
## real alpha[N];
## real alpha_bar;
## real beta_predation;
## }
## transformed parameters {
## vector[N] p;
##
## for (i in 1:N) {
## p[i] = inv_logit(alpha[i] + beta_predation * predation[i]);
## }
## }
## model {
## alpha_bar ~ normal(0, 0.25);
## alpha ~ normal(alpha_bar, sigma);
## beta_predation ~ normal(0, 0.5);
## sigma ~ exponential(1);
## for (i in 1:N) {
## survival[i] ~ binomial(density[i], p[i]);
## }
## }
model_frogs_2_draws <- tar_read(stan_d_mcmc_w08_model_frogs_2)$draws()
mcmc_areas(model_frogs_2_draws, regex_pars = 'predation')

writeLines(readLines(tar_read(stan_d_file_w08_model_frogs_3)))
## data {
## int N;
## int survival[N];
## int density[N];
## int tank[N];
## int size[N];
## }
## parameters {
## real sigma;
## real alpha[N];
## real alpha_bar;
## real beta_size;
## }
## transformed parameters {
## vector[N] p;
##
## for (i in 1:N) {
## p[i] = inv_logit(alpha[i] + beta_size * size[i]);
## }
## }
## model {
## alpha_bar ~ normal(0, 0.25);
## alpha ~ normal(alpha_bar, sigma);
## beta_size ~ normal(0, 0.5);
## sigma ~ exponential(1);
## for (i in 1:N) {
## survival[i] ~ binomial(density[i], p[i]);
## }
## }
model_frogs_3_draws <- tar_read(stan_d_mcmc_w08_model_frogs_3)$draws()
mcmc_areas(model_frogs_3_draws, regex_pars = 'size')

writeLines(readLines(tar_read(stan_d_file_w08_model_frogs_4)))
## data {
## int N;
## int survival[N];
## int density[N];
## int tank[N];
## int size[N];
## int predation[N];
## }
## parameters {
## real sigma;
## real alpha[N];
## real alpha_bar;
## real beta_size;
## real beta_predation;
## real beta_interaction;
## }
## transformed parameters {
## vector[N] p;
##
## for (i in 1:N) {
## p[i] = inv_logit(alpha[i] + beta_size * size[i] + beta_predation * predation[i] + beta_interaction * (size[i] * predation[i]));
## }
## }
## model {
## alpha_bar ~ normal(0, 0.25);
## alpha ~ normal(alpha_bar, sigma);
## beta_size ~ normal(0, 0.5);
## beta_predation ~ normal(0, 0.5);
## beta_interaction ~ normal(0, 0.25);
## sigma ~ exponential(1);
## for (i in 1:N) {
## survival[i] ~ binomial(density[i], p[i]);
## }
## }
model_frogs_4_draws <- tar_read(stan_d_mcmc_w08_model_frogs_4)$draws()
mcmc_areas(model_frogs_4_draws, regex_pars = 'beta')

Negative influence of predation is somewhat balanced by size of tank.
8.2 Question 2
In 1980, a typical Bengali woman could have 5 or more children in her lifetime. By the year 2000, a typical Bengali woman had only 2 or 3. You’re going to look at a historical set of data, when contraception was widely available but many families chose not to use it. These data reside in
data(bangladesh)
and come from the 1988 Bangladesh Fertility Survey. Each row is one of 1934 women. There are six variables, but you can focus on two of them for this practice problem: (1) district: ID number of administrative district each woman resided in (2) use.contraception: An indicator (0/1) of whether the woman was using contraception.
DT <- data_bangladesh()
precis(DT)
## mean sd 5.5% 94.5% histogram
## woman 967.500000000000000000 558.44 107.3 1827.7 ▇▇▇▇▇▇▇▇▇▅
## district 29.252843846949328821 17.80 1.0 57.0 ▇▅▇▅▅▇▅▃▅▅▅▅
## use.contraception 0.392450879007238906 0.49 0.0 1.0 ▇▁▁▁▁▁▁▁▁▅
## living.children 2.652016546018614473 1.24 1.0 4.0 ▅▃▁▃▁▇
## age.centered 0.002197880041365139 9.01 -12.6 16.4 ▅▇▇▅▃▃▂
## urban 0.290589451913133401 0.45 0.0 1.0 ▇▁▁▁▁▁▁▁▁▃
## id 967.500000000000000000 558.44 107.3 1827.7 ▇▇▇▇▇▇▇▇▇▅
## contraception 0.392450879007238906 0.49 0.0 1.0 ▇▁▁▁▁▁▁▁▁▅
## scale_age 0.000000000000000014 1.00 -1.4 1.8 ▁▇▇▅▇▃▃▂▁
Now, focus on predicting use.contraception
, clustered by district_id
. Fit
both (1) a traditional fixed-effects model that uses an index variable for
district and (2) a multilevel model with varying intercepts for district. Plot
the predicted proportions of women in each district using contraception, for
both the fixed-effects model and the varying-effects model. That is, make a plot
in which district ID is on the horizontal axis and expected proportion using
contraception is on the vertical.Make one plot for each model, or layer them on
the same plot, as you prefer. How do the models disagree? Can you explain the
pattern of disagreement? In particular, can you explain the most extreme cases
of disagreement, both why they happen where they do and why the models reach
different inferences?
Fixed effects model
writeLines(readLines(tar_read(stan_e_file_w08_model_bang_fixed)))
## data {
## // Integers for number of rows, and number of districts
## int<lower=0> N;
## int<lower=0> N_district;
##
## // District and contraception, expecting integers of length N
## int district[N];
## int contraception[N];
## }
## parameters {
## // Alpha vector matching length of number of districts
## vector[N_district] alpha;
## }
## model {
## // p vector matching length of number of districts
## vector[N] p;
##
## // Alpha is distributed normally
## alpha ~ normal(0, 1.5);
##
## // For each for in data, alpha for that row's district
## for (i in 1:N) {
## p[i] = inv_logit(alpha[district[i]]);
## }
##
## // Contraception if distributed with bernoulli, p
## contraception ~ bernoulli(p);
## }
model_bang_fixed_draws <- tar_read(stan_e_mcmc_w08_model_bang_fixed)$draws()
mcmc_areas(model_bang_fixed_draws, regex_pars = 'alpha', transformations = inv.logit)

Multilevel model
writeLines(readLines(tar_read(stan_e_file_w08_model_bang_multi)))
## data {
## // Integers for number of rows, and number of districts
## int<lower=0> N;
## int<lower=0> N_district;
##
## // District and contraception, expecting integers of length N
## int district[N];
## int contraception[N];
## }
## parameters {
## // Alpha vector matching length of number of districts
## vector[N_district] alpha;
## real<lower=0> sigma;
##
## // Hyper parameter alpha bar
## real alpha_bar;
## }
## model {
## // p vector matching length of number of districts
## vector[N] p;
##
## // For each for in data, alpha for that row's district
## for (i in 1:N) {
## p[i] = inv_logit(alpha[district[i]]);
## }
##
## // Hyper priors: alpha bar and sigma
## alpha_bar ~ normal(0, 1.5);
## sigma ~ exponential(1);
##
## // Priors
## // Alpha is distributed normally
## alpha ~ normal(alpha_bar, sigma);
##
## // Contraception if distributed with bernoulli, p
## contraception ~ bernoulli(p);
## }
model_bang_multi_draws <- tar_read(stan_e_mcmc_w08_model_bang_multi)$draws()
mcmc_trace(model_bang_multi_draws)

mcmc_areas(model_bang_multi_draws, regex_pars = 'alpha', transformations = inv.logit)

mcmc_areas(model_bang_multi_draws, pars = c('alpha_bar', 'sigma'))

Comparison
(mcmc_areas(model_bang_fixed_draws, regex_pars = 'alpha', transformations = inv.logit) + labs(title = 'fixed')) +
(mcmc_areas(model_bang_multi_draws, regex_pars = 'alpha\\[', transformations = inv.logit) + labs(title = 'multilevel'))

setDT(model_bang_fixed_draws)
setDT(model_bang_multi_draws)
compare <- rbindlist(list(
melt(model_bang_fixed_draws, measure.vars = patterns('alpha'))[, model_type := 'fixed'],
melt(model_bang_multi_draws, measure.vars = patterns('alpha'))[, model_type := 'multilevel']
), fill = TRUE)
compare[, variable := as.integer(gsub('alpha\\[|\\]', '', variable))]
## Warning in eval(jsub, SDenv, parent.frame()): NAs introduced by coercion
ggplot(compare[, .(value = mean(value)), .(variable, model_type)]) +
geom_hline(yintercept = 0, alpha = 0.5) +
geom_point(aes(variable, value, color = model_type)) +
geom_line(aes(variable, value, group = model_type), alpha = 0.2) +
scale_color_viridis_d(begin = 0.3, end = 0.8) +
labs(x = 'district', y = 'alpha')
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 row(s) containing missing values (geom_path).

8.3 Question 3
Return to the Trolley data,
data(Trolley)
, from Chapter 12. Define and fit a varying intercepts model for these data. By this I mean to add an intercept parameter for the individual to the linear model. Cluster the varying intercepts on individual participants, as indicated by the unique values in the id variable. Include action, intention, and contact as before. Compare the varying intercepts model and a model that ignores individuals, using both WAIC/LOO and posterior predictions. What is the impact of individual variation in these data?
DT <- data_trolley()
precis(DT)
## mean sd 5.5% 94.5% histogram
## case NaN NA NA NA
## response 4.20 1.91 1 7 ▃▂▁▃▁▇▁▅▁▅▁▅
## order 16.50 9.29 2 31 ▇▅▇▇▅▇▂
## id NaN NA NA NA
## age 37.49 14.23 18 61 ▂▇▅▇▅▇▅▅▃▅▂▁▁
## male 0.57 0.49 0 1 ▅▁▁▁▁▁▁▁▁▇
## edu NaN NA NA NA
## action 0.43 0.50 0 1 ▇▁▁▁▁▁▁▁▁▅
## intention 0.47 0.50 0 1 ▇▁▁▁▁▁▁▁▁▇
## contact 0.20 0.40 0 1 ▇▁▁▁▁▁▁▁▁▂
## story NaN NA NA NA
## action2 0.63 0.48 0 1 ▃▁▁▁▁▁▁▁▁▇
## education 5.73 1.35 3 8 ▁▁▁▁▁▂▁▅▁▇▁▂▁▂
## individual 166.00 95.56 19 313 ▇▇▇▇▇▇▅
writeLines(readLines(tar_read(stan_c_file_w08_model_trolley)))
## data {
## int N;
## int K;
## int response[N];
## int action[N];
## int intention[N];
## int contact[N];
##
## }
## parameters {
## // Cut points are the positions of responses along cumulative odds
## ordered[K] cutpoints;
## real beta_action;
## real beta_intention;
## real beta_contact;
## real beta_intention_contact;
## real beta_intention_action;
## }
## transformed parameters {
## vector[N] phi;
##
## for (i in 1:N) {
## phi[i] = beta_action * action[i] + beta_contact * contact[i] + beta_intention * intention[i] + beta_intention_contact * intention[i] * contact[i] + beta_intention_action * intention[i] * action[i];
## }
## }
## model {
## response ~ ordered_logistic(phi, cutpoints);
## cutpoints ~ normal(0, 1.5);
## beta_action ~ normal(0, 0.5);
## beta_contact ~ normal(0, 0.5);
## beta_intention ~ normal(0, 0.5);
## }
## generated quantities {
## vector[N] log_lik;
## for (i in 1:N) {
## log_lik[i] = ordered_logistic_lpmf(response[i] | phi[i], cutpoints);
## }
## }
model_trolley <- tar_read(stan_c_mcmc_w08_model_trolley)
model_trolley_draws <- model_trolley$draws()
mcmc_areas(model_trolley_draws, regex_pars = 'cutpoints')

mcmc_areas(model_trolley_draws, regex_pars = 'beta')

options(cmdstanr_draws_format = "draws_array")
model_trolley_loo <- model_trolley$loo()
options(cmdstanr_draws_format = "draws_df")
writeLines(readLines(tar_read(stan_c_file_w08_model_trolley_multi)))
## data {
## int N;
## int K;
## int response[N];
## int action[N];
## int intention[N];
## int contact[N];
## int individual[N];
## int N_individual;
## }
## parameters {
## // Cut points are the positions of responses along cumulative odds
## ordered[K] cutpoints;
## real beta_action;
## real beta_intention;
## real beta_contact;
## real beta_intention_contact;
## real beta_intention_action;
## real alpha_bar;
## real sigma;
## vector[N_individual] alpha;
## }
## transformed parameters {
## vector[N] phi;
##
## for (i in 1:N) {
## phi[i] = alpha[individual[i]] + beta_action * action[i] +
## beta_contact * contact[i] + beta_intention * intention[i] +
## beta_intention_contact * intention[i] * contact[i] +
## beta_intention_action * intention[i] * action[i];
## }
## }
## model {
## // Hyper parameter priors
## alpha_bar ~ normal(0, 1.5);
## sigma ~ exponential(1);
##
## // Priors
## alpha ~ normal(alpha_bar, sigma);
##
## response ~ ordered_logistic(phi, cutpoints);
## cutpoints ~ normal(0, 1.5);
## beta_action ~ normal(0, 0.5);
## beta_contact ~ normal(0, 0.5);
## beta_intention ~ normal(0, 0.5);
## }
## generated quantities {
## vector[N] log_lik;
## for (i in 1:N) {
## log_lik[i] = ordered_logistic_lpmf(response[i] | phi[i], cutpoints);
## }
## }
model_trolley_multi <- tar_read(stan_c_mcmc_w08_model_trolley_multi)
model_trolley_multi_draws <- model_trolley_multi$draws()
mcmc_areas(model_trolley_multi_draws, regex_pars = 'cutpoints')

mcmc_areas(model_trolley_multi_draws, regex_pars = 'beta')

options(cmdstanr_draws_format = "draws_array")
model_trolley_multi_loo <- model_trolley_multi$loo()
options(cmdstanr_draws_format = "draws_df")
col_patterns <- '^beta|^alpha|^sigma|^cutpoints'
setDT(model_trolley_draws)
setDT(model_trolley_multi_draws)
precis(model_trolley_draws[, .SD, .SDcols = patterns(col_patterns)])
## 6 vector or matrix parameters hidden. Use depth=2 to show them.
## mean sd 5.5% 94.5% histogram
## beta_action -0.46 0.055 -0.54 -0.37 ▁▁▁▃▇▇▂▁▁▁
## beta_intention -0.27 0.058 -0.36 -0.18 ▁▁▁▅▇▅▂▁▁▁
## beta_contact -0.32 0.069 -0.43 -0.21 ▁▁▁▂▅▇▇▃▁▁▁
## beta_intention_contact -1.29 0.097 -1.44 -1.13 ▁▁▁▁▃▅▇▇▇▃▂▁▁▁▁
## beta_intention_action -0.46 0.081 -0.59 -0.34 ▁▁▁▁▃▅▇▇▅▂▁▁▁
precis(model_trolley_multi_draws[, .SD, .SDcols = patterns(col_patterns)])
## 337 vector or matrix parameters hidden. Use depth=2 to show them.
## mean sd 5.5% 94.5% histogram
## beta_action -0.63 0.056 -0.72 -0.54 ▁▁▂▇▇▅▁▁▁
## beta_intention -0.36 0.060 -0.45 -0.26 ▁▁▁▃▇▇▃▁▁▁
## beta_contact -0.42 0.072 -0.54 -0.30 ▁▁▁▂▇▇▇▃▁▁▁
## beta_intention_contact -1.74 0.101 -1.91 -1.58 ▁▁▁▃▇▅▁▁▁
## beta_intention_action -0.60 0.082 -0.73 -0.46 ▁▁▁▁▂▅▇▇▅▂▁▁▁
## alpha_bar 1.01 0.437 0.26 1.72 ▁▁▁▂▂▂▇▇▅▂▂▁▁
## sigma 1.92 0.082 1.80 2.06 ▁▁▁▁▃▇▇▅▃▁▁▁▁▁
compared <- loo_compare(model_trolley_loo, model_trolley_multi_loo)
print(compared, simplify = FALSE)
## elpd_diff se_diff elpd_loo se_elpd_loo p_loo se_p_loo looic
## model2 0.0 0.0 -15529.0 89.8 356.8 4.7 31058.0
## model1 -2935.7 86.8 -18464.7 40.5 11.1 0.1 36929.5
## se_looic
## model2 179.6
## model1 81.1