library(tidyverse)
library(psych)
library(cowplot)
library(patchwork)
library(here)
library(brms)
library(tidybayes)
Entropy is a measure of uncertainty. Distributions with high entropy have more uncertainty because they have more possibilities.
# Create example data illustrating low- vs high-entropy distributions
set.seed(123)

# Low entropy data: probability mass concentrated on a few values.
# (sample() normalizes `prob` internally, so the weights need not sum to 1.)
low_entropy <- data.frame(
  value = sample(x = c(1:6), 10000, replace = TRUE,
                 prob = c(.3, .25, .2, .1, .05, 0)),
  type = "Low Entropy"
)

# High entropy data: uniform over all six values (maximum spread)
high_entropy <- data.frame(
  value = sample(x = c(1:6), 10000, replace = TRUE),
  type = "High Entropy"
)

# Combine the data
entropy_data <- rbind(low_entropy, high_entropy)

# Side-by-side histograms. geom_histogram() plots counts by default,
# so the y axis is labeled "Count" (the earlier "Density" label was wrong).
ggplot(entropy_data, aes(x = value)) +
  geom_histogram(fill = "#1c5253", alpha = 0.5, binwidth = 1, color = "white") +
  facet_wrap(~type) +
  labs(title = "Comparing High and Low Entropy Distributions",
       x = "Value",
       y = "Count") +
  theme_cowplot() +
  theme(strip.background = element_rect(fill = "#1c5253"),
        strip.text = element_text(color = "white", size = 12))
When you choose a prior and a likelihood function for your data – the distribution representing the behavior of your outcome – the best choice is the distribution that maximizes entropy while still meeting the constraints of your variable.
First, the distribution with the biggest entropy is the widest and least informative distribution. Choosing the distribution with the largest entropy means spreading probability as evenly as possible, while still remaining consistent with anything we think we know about a process.
Second, nature tends to produce empirical distributions that have high entropy.
Third, it works.
So many! But here are a few:
In general, this distribution is your best bet for binary outcomes.
# Create data for different size (N) and probability (p) parameters
x <- seq(0, 100, by = 1)
N <- c(1, 5, 100)
p <- c(.25, .40, .70)

binom_data <- expand.grid(x = x, N = N, p = p) %>%
  filter(x <= N) %>%  # the binomial has support 0..N only
  mutate(density = dbinom(x, prob = p, size = N),
         p = str_c("p = ", p),
         # zero-pad N so facet labels sort numerically ("001" < "005" < "100")
         N = factor(N, levels = c(1, 5, 100), labels = c("001", "005", "100")),
         N = str_c("N = ", N))

ggplot(binom_data, aes(x = x, y = density, fill = p)) +
  geom_col() +  # identical to geom_bar(stat = "identity")
  labs(title = "Binomial Distribution with Different Parameters",
       x = "x",
       y = "Density",
       fill = "p") +  # the mapped aesthetic is fill, not color
  theme_cowplot() +
  facet_wrap(p ~ N, scales = "free") +
  guides(fill = "none")  # "none" replaces deprecated guides(fill = FALSE)
# Create data for different rate parameters of the exponential distribution
x <- seq(0, 5, length.out = 1000)
rates <- c(0.5, 1, 2)

exp_data <- expand.grid(x = x, rate = rates) %>%
  mutate(density = dexp(x, rate),
         rate = factor(rate, labels = paste("λ =", rates)))

ggplot(exp_data, aes(x = x, y = density, color = rate)) +
  # linewidth replaces the deprecated `size` aesthetic for lines (ggplot2 >= 3.4)
  geom_line(linewidth = 1) +
  labs(title = "Exponential Distribution with Different Rate Parameters",
       x = "x",
       y = "Density",
       color = "Rate") +
  theme_cowplot() +
  scale_color_manual(values = c("#1c5253", "#5e8485", "#0f393a"))
# Create data for different (shape, rate) parameter pairs of the gamma
x <- seq(0, 10, length.out = 1000)
shape_rate <- list(c(1, 1), c(2, 2), c(5, 1))  # each entry is c(shape, rate)
# named `param_labels` rather than `names`, which would mask base::names()
param_labels <- c("a=1, b=1", "a=2, b=2", "a=5, b=1")

gamma_data <- map_df(seq_along(shape_rate), function(i) {
  data.frame(
    x = x,
    density = dgamma(x, shape = shape_rate[[i]][1], rate = shape_rate[[i]][2]),
    params = param_labels[i]
  )
})

ggplot(gamma_data, aes(x = x, y = density, color = params)) +
  # linewidth replaces the deprecated `size` aesthetic for lines (ggplot2 >= 3.4)
  geom_line(linewidth = 1) +
  labs(title = "Gamma Distribution with Different Shape and Rate Parameters",
       x = "x",
       y = "Density",
       color = "Parameters") +
  theme_cowplot() +
  scale_color_manual(values = c("#1c5253", "#5e8485", "#0f393a"))
# Poisson probability mass for three different rate (lambda) parameters
x <- 0:15
lambda_vals <- c(1, 3, 7)

pois_data <- expand.grid(x = x, lambda = lambda_vals) %>%
  mutate(probability = dpois(x, lambda),
         lambda = factor(lambda, labels = paste("λ =", lambda_vals)))

# Dodged bars so the three distributions sit side by side at each count
ggplot(pois_data, aes(x = x, y = probability, fill = lambda)) +
  geom_col(alpha = 0.7, position = "dodge") +
  labs(title = "Poisson Distribution with Different Rate Parameters",
       x = "Count",
       y = "Probability",
       fill = "Lambda") +
  theme_cowplot() +
  scale_fill_manual(values = c("#1c5253", "#e07a5f", "#f2cc8f"))
From McElreath’s lecture:
# generative model, basic mediator scenario (gender -> department -> acceptance)
set.seed(0319)
N <- 1000 # number of applicants

# even gender distribution
G <- sample(1:2, size = N, replace = TRUE)

# gender 1 tends to apply to department 1, gender 2 to department 2
D <- rbinom(n = N, size = 1, prob = ifelse(G == 1, 0.3, 0.8)) + 1

# matrix of acceptance rates: rows index department, columns index gender
accept_rate <- matrix(c(0.5, 0.2, 0.1, 0.3), nrow = 2)

# simulate acceptance.
# BUG FIX: accept_rate[D, G] with two length-N vectors extracts an N x N
# submatrix; rbinom() then recycles only its first N entries, so every
# applicant would get a rate based on the FIRST applicant's gender.
# Indexing with a two-column matrix via cbind() picks the (D_i, G_i)
# entry for each applicant individually.
A <- rbinom(n = N, size = 1, prob = accept_rate[cbind(D, G)])
dat <- data.frame(D = as.character(D), G = as.character(G), A)
We’re going to fit two models here, one estimating the total effect of gender on acceptance, and one estimating the direct effect stratifying on department.
$$A_i \sim \text{Bernoulli}(p_i)$$
$$\text{logit}(p_i) = \alpha[G_i]$$
$$\alpha_j \sim \text{Normal}(0, 1)$$
# Total effect of gender on acceptance.
# Index coding (~ 0 + G): no intercept, one log-odds coefficient per gender,
# so a single Normal(0, 1) prior on class b covers both.
m1 <- brm(
  data = dat,
  family = bernoulli,
  A ~ 0 + G,
  prior = c(prior(normal(0, 1), class = b)),
  iter = 5000, warmup = 1000, chains = 4,
  seed = 3,
  file = here("files/models/51.1")  # cache the fit; re-knits reload it
)
## Compiling Stan program...
## Trying to compile a simple C file
## Start sampling
m1
## Family: bernoulli
## Links: mu = logit
## Formula: A ~ 0 + G
## Data: dat (Number of observations: 1000)
## Draws: 4 chains, each with iter = 5000; warmup = 1000; thin = 1;
## total post-warmup draws = 16000
##
## Regression Coefficients:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## G1 -1.64 0.13 -1.90 -1.40 1.00 12575 9503
## G2 -1.04 0.10 -1.23 -0.85 1.00 14831 10819
##
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
$$A_i \sim \text{Bernoulli}(p_i)$$
$$\text{logit}(p_i) = \alpha[G_i, D_i]$$
$$\alpha_j \sim \text{Normal}(0, 1)$$
# Direct effect of gender, stratifying on department.
# G*D expands to G + D + G:D (indicator coding with an intercept), which is
# an equivalent parameterization of one log-odds per gender-department cell.
m2 <- brm(
  data = dat,
  family = bernoulli,
  A ~ G * D,
  prior = c(prior(normal(0, 1), class = Intercept),
            prior(normal(0, 1), class = b)),
  iter = 5000, warmup = 1000, chains = 4,
  seed = 3,
  file = here("files/models/51.2")  # cache the fit; re-knits reload it
)
## Compiling Stan program...
## Trying to compile a simple C file
## Start sampling
m2
## Family: bernoulli
## Links: mu = logit
## Formula: A ~ G * D
## Data: dat (Number of observations: 1000)
## Draws: 4 chains, each with iter = 5000; warmup = 1000; thin = 1;
## total post-warmup draws = 16000
##
## Regression Coefficients:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept -2.07 0.16 -2.39 -1.75 1.00 9026 10062
## G2 0.33 0.29 -0.25 0.88 1.00 7352 8462
## D2 1.15 0.24 0.68 1.63 1.00 7984 8930
## G2:D2 -0.33 0.35 -1.00 0.35 1.00 6300 7196
##
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
posterior_summary(m1) %>% round(2)
## Estimate Est.Error Q2.5 Q97.5
## b_G1 -1.64 0.13 -1.90 -1.40
## b_G2 -1.04 0.10 -1.23 -0.85
## lprior -3.74 0.23 -4.22 -3.32
## lp__ -515.12 0.99 -517.85 -514.16
posterior_summary(m2) %>% round(2)
## Estimate Est.Error Q2.5 Q97.5
## b_Intercept -2.07 0.16 -2.39 -1.75
## b_G2 0.33 0.29 -0.25 0.88
## b_D2 1.15 0.24 0.68 1.63
## b_G2:D2 -0.33 0.35 -1.00 0.35
## Intercept -1.39 0.08 -1.55 -1.23
## lprior -5.55 0.47 -6.67 -4.84
## lp__ -502.34 1.40 -505.96 -500.60
A reminder that we’re running a LOGISTIC REGRESSION, which is a special case of a BINOMIAL REGRESSION. We have used a link function – the logit link function – to convert our outcome (probability) into something that is not bounded and can be estimated using linear equations. But now we need to go back from the logit to the probability.
$$\text{logit}(p) = \log\left(\frac{p}{1-p}\right)$$
$$p = \frac{\exp(q)}{1+\exp(q)}$$
It doesn’t matter how you fit the model. Use
add_epred_draws() to get estimated values for each
combination. This also gives you the probability, not the logit.
Yay!
# Prediction grid: every combination of gender and department
new_dat <- expand.grid(G = c("1", "2"),
                       D = c("1", "2"))

# Expected values per posterior draw; add_epred_draws() returns .epred on
# the probability scale (the link is already inverted)
m1 %>% add_epred_draws(newdata = new_dat) %>%
  head()
## # A tibble: 6 × 7
## # Groups: G, D, .row [1]
## G D .row .chain .iteration .draw .epred
## <fct> <fct> <int> <int> <int> <int> <dbl>
## 1 1 1 1 NA NA 1 0.149
## 2 1 1 1 NA NA 2 0.141
## 3 1 1 1 NA NA 3 0.153
## 4 1 1 1 NA NA 4 0.174
## 5 1 1 1 NA NA 5 0.186
## 6 1 1 1 NA NA 6 0.175
Use add_epred_draws to get the posterior distribution
for p for each combination of department and gender.
# Posterior median and 95% quantile interval of p for each G x D cell.
# m1 does not include D, so rows sharing a gender are identical.
m1 %>%
  add_epred_draws(newdata = new_dat) %>%
  median_qi()
## # A tibble: 4 × 9
## G D .row .epred .lower .upper .width .point .interval
## <fct> <fct> <int> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 1 1 1 0.163 0.130 0.197 0.95 median qi
## 2 1 2 3 0.163 0.130 0.197 0.95 median qi
## 3 2 1 2 0.261 0.226 0.299 0.95 median qi
## 4 2 2 4 0.261 0.226 0.299 0.95 median qi
# Same summary from the department-stratified model: the probabilities
# now differ across departments within each gender.
m2 %>%
  add_epred_draws(newdata = new_dat) %>%
  median_qi()
## # A tibble: 4 × 9
## G D .row .epred .lower .upper .width .point .interval
## <fct> <fct> <int> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 1 1 1 0.113 0.0836 0.148 0.95 median qi
## 2 1 2 3 0.287 0.217 0.367 0.95 median qi
## 3 2 1 2 0.151 0.0943 0.221 0.95 median qi
## 4 2 2 4 0.287 0.246 0.330 0.95 median qi
Our other useful function add_predicted_draws() is back
in the realm of the outcome: 1’s and 0’s.
# Posterior predictive draws: simulated 0/1 outcomes on the scale of the
# data, rather than probabilities.
m1 %>%
  add_predicted_draws(newdata = new_dat) %>%
  head()
## # A tibble: 6 × 7
## # Groups: G, D, .row [1]
## G D .row .chain .iteration .draw .prediction
## <fct> <fct> <int> <int> <int> <int> <int>
## 1 1 1 1 NA NA 1 0
## 2 1 1 1 NA NA 2 0
## 3 1 1 1 NA NA 3 0
## 4 1 1 1 NA NA 4 0
## 5 1 1 1 NA NA 5 0
## 6 1 1 1 NA NA 6 1
Use add_predicted_draws() to simulate fictional people
in these departments:
# Tally simulated acceptances (1) and rejections (0) within each
# gender x department cell for m1
m1 %>%
  add_predicted_draws(newdata = new_dat) %>%
  ungroup() %>%
  count(G, D, .prediction)
## # A tibble: 8 × 4
## G D .prediction n
## <fct> <fct> <int> <int>
## 1 1 1 0 13363
## 2 1 1 1 2637
## 3 1 2 0 13357
## 4 1 2 1 2643
## 5 2 1 0 11800
## 6 2 1 1 4200
## 7 2 2 0 11796
## 8 2 2 1 4204
# Same tally from the department-stratified model
m2 %>%
  add_predicted_draws(newdata = new_dat) %>%
  ungroup() %>%
  count(G, D, .prediction)
## # A tibble: 8 × 4
## G D .prediction n
## <fct> <fct> <int> <int>
## 1 1 1 0 14178
## 2 1 1 1 1822
## 3 1 2 0 11463
## 4 1 2 1 4537
## 5 2 1 0 13527
## 6 2 1 1 2473
## 7 2 2 0 11351
## 8 2 2 1 4649
# Compare expected values (.epred, a probability in [0, 1]) with posterior
# predictions (.prediction, a simulated 0/1 outcome) side by side
m1_epred <- add_epred_draws(new_dat, m1)
m1_pred <- add_predicted_draws(new_dat, m1)

full_join(m1_epred, m1_pred) %>%
  pivot_longer(cols = c(".epred", ".prediction")) %>%
  ggplot(aes(x = value)) +
  geom_histogram(position = "dodge") +
  facet_grid(G ~ name, scales = "free")
## Joining with `by = join_by(G, D, .row, .chain, .iteration, .draw)`
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Using the data UCBadmit from the rethinking
package, fit two models: one estimating the total effect of gender on admission, and one estimating the direct effect stratifying on department.
Hint: these data are given in the aggregated form, not the long form. You’ll need to use a binomial distribution, not a bernoulli. And you’ll need to specify the number of trials. So your formula will look like this:
admit | trials(applications) ~ [predictors go here]
# Load the aggregated UC Berkeley admissions data and copy the
# applicant.gender column to a shorter name for formulas below
data(UCBadmit, package = "rethinking")
UCBadmit$gender <- UCBadmit$applicant.gender
# Total effect of gender on admission counts.
# Aggregated binomial likelihood: trials(applications) supplies the number
# of applicants per row; index coding gives one coefficient per gender.
m3 <- brm(
  admit | trials(applications) ~ 0 + gender,
  data = UCBadmit,
  family = binomial,
  prior = c(prior(normal(0, 2), class = b)),
  iter = 5000, warmup = 1000, chains = 4,
  seed = 3,
  file = here("files/models/51.3")
)
## Compiling Stan program...
## Trying to compile a simple C file
## Start sampling
# Direct effect of gender, stratifying on department via the
# gender x department interaction (indicator coding with an intercept)
m4 <- brm(
  admit | trials(applications) ~ gender * dept,
  data = UCBadmit,
  family = binomial,
  prior = c(prior(normal(0, 2), class = Intercept),
            prior(normal(0, 2), class = b)),
  iter = 5000, warmup = 1000, chains = 4,
  seed = 3,
  file = here("files/models/51.4")
)
## Compiling Stan program...
## Trying to compile a simple C file
## Start sampling
Explore your options with get_variables() and use the
spread draws() function to sample from the posterior
distribution of one of your models. What does this return?
get_variables(m3)
## [1] "b_genderfemale" "b_gendermale" "lprior" "lp__"
## [5] "accept_stat__" "stepsize__" "treedepth__" "n_leapfrog__"
## [9] "divergent__" "energy__"
# Pull posterior draws of the two gender coefficients (log-odds scale)
# into a tidy tibble, one row per draw
m3 %>%
spread_draws(b_genderfemale, b_gendermale)
## # A tibble: 16,000 × 5
## .chain .iteration .draw b_genderfemale b_gendermale
## <int> <int> <int> <dbl> <dbl>
## 1 1 1 1 -0.914 -0.169
## 2 1 2 2 -0.757 -0.268
## 3 1 3 3 -0.742 -0.274
## 4 1 4 4 -0.697 -0.284
## 5 1 5 5 -0.940 -0.200
## 6 1 6 6 -0.841 -0.247
## 7 1 7 7 -0.828 -0.171
## 8 1 8 8 -0.810 -0.241
## 9 1 9 9 -0.814 -0.294
## 10 1 10 10 -0.783 -0.140
## # ℹ 15,990 more rows
get_variables(m4)
## [1] "b_Intercept" "b_gendermale" "b_deptB"
## [4] "b_deptC" "b_deptD" "b_deptE"
## [7] "b_deptF" "b_gendermale:deptB" "b_gendermale:deptC"
## [10] "b_gendermale:deptD" "b_gendermale:deptE" "b_gendermale:deptF"
## [13] "Intercept" "lprior" "lp__"
## [16] "accept_stat__" "stepsize__" "treedepth__"
## [19] "n_leapfrog__" "divergent__" "energy__"
# Pull all fixed-effect draws from the interaction model. Interaction
# coefficient names contain ":" and so must be backtick-quoted.
m4 %>%
spread_draws(b_Intercept, b_gendermale, b_deptB, b_deptC,
b_deptD, b_deptE, b_deptF,
`b_gendermale:deptB`,`b_gendermale:deptC`,
`b_gendermale:deptD`, `b_gendermale:deptE`,
`b_gendermale:deptF`)
## # A tibble: 16,000 × 15
## .chain .iteration .draw b_Intercept b_gendermale b_deptB b_deptC b_deptD
## <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1 1 1.23 -0.756 -1.28 -1.83 -1.62
## 2 1 2 2 1.16 -0.744 -0.700 -1.65 -1.69
## 3 1 3 3 1.33 -0.759 -0.133 -1.90 -1.99
## 4 1 4 4 1.12 -0.544 0.0159 -1.65 -1.60
## 5 1 5 5 1.21 -0.707 -0.648 -1.88 -1.86
## 6 1 6 6 1.36 -0.826 -0.433 -2.02 -1.89
## 7 1 7 7 1.13 -0.643 -0.656 -1.89 -1.80
## 8 1 8 8 1.20 -0.701 -0.461 -1.78 -1.61
## 9 1 9 9 1.03 -0.602 -0.758 -1.70 -1.35
## 10 1 10 10 1.12 -0.509 -0.605 -1.71 -1.72
## # ℹ 15,990 more rows
## # ℹ 7 more variables: b_deptE <dbl>, b_deptF <dbl>, `b_gendermale:deptB` <dbl>,
## # `b_gendermale:deptC` <dbl>, `b_gendermale:deptD` <dbl>,
## # `b_gendermale:deptE` <dbl>, `b_gendermale:deptF` <dbl>
Use the add_epred_draws() function to estimate expected
values based on your models. What is the unit on .epred for
this model?
# For an aggregated binomial model, .epred is on the scale of the outcome:
# the expected NUMBER of admitted applicants per row (a count out of
# trials(applications)), not a probability.
UCBadmit %>% add_epred_draws(m3) %>%
ungroup() %>%
group_by(gender) %>%
mean_qi(.epred)
## # A tibble: 2 × 7
## gender .epred .lower .upper .width .point .interval
## <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 female 92.9 7.32 187. 0.95 mean qi
## 2 male 200. 83.1 376. 0.95 mean qi
Recreate this figure (from the model without department): this figure shows that women applicants are accepted at a rate .09 to .18 lower than male applicants.
# Posterior distribution of the female - male difference in acceptance
# probability (total effect, model without department)
m3 %>%
  spread_draws(b_genderfemale, b_gendermale) %>%
  mutate(
    # convert log-odds to probabilities; stats::plogis() is the base-R
    # inverse-logit, so no dependency on the rethinking package is needed
    across(starts_with("b"), plogis),
    diff = b_genderfemale - b_gendermale) %>%
  ggplot(aes(x = diff)) +
  geom_density(color = "#1c5253", linewidth = 2) +
  labs(x = "P_female - P_male")
Recreate this figure (from the model with department).
# Department-specific gender gaps from the stratified model: predict
# expected admits for a large fictional applicant pool, then divide by
# the pool size to get back to probabilities
new_dat <- distinct(UCBadmit, gender, dept) %>%
  mutate(applications = 1e5)

add_epred_draws(new_dat, m4) %>%
  ungroup() %>%
  select(dept, gender, .draw, .epred) %>%
  pivot_wider(names_from = gender, values_from = .epred) %>%
  mutate(diff = (female - male) / 1e5) %>%  # counts -> probability difference
  ggplot(aes(x = diff, color = dept)) +
  geom_density(linewidth = 2) +
  labs(x = "P_female - P_male") +
  guides(color = "none")
Calculate the probability of acceptance for each gender within each department for both your models. Plot the results.
# Acceptance probability by gender within each department, under both the
# total-effect model (m3) and the department-stratified model (m4)
m3_epred <- UCBadmit %>% add_epred_draws(m3) %>%
  mutate(prob = .epred / applications) %>%  # expected counts -> probabilities
  median_qi(prob) %>%
  mutate(model = "m3")

m4_epred <- UCBadmit %>% add_epred_draws(m4) %>%
  mutate(prob = .epred / applications) %>%
  median_qi(prob) %>%
  mutate(model = "m4")

# Stack the two summaries and plot medians with 95% interval bars,
# faceted by model
m3_epred %>% full_join(m4_epred) %>%
  ggplot(
    aes(x = dept, y = prob, color = gender)
  ) +
  geom_errorbar(
    aes(ymin = .lower, ymax = .upper),
    width = .5) +
  geom_point() +
  facet_wrap(~model)
## Joining with `by = join_by(dept, applicant.gender, admit, reject, applications,
## gender, .row, prob, .lower, .upper, .width, .point, .interval, model)`