library(tidyverse)
library(psych)
library(cowplot)
library(patchwork)
library(here)
library(brms) 
library(tidybayes) 

entropy

Entropy is a measure of uncertainty. Distributions with high entropy have more uncertainty because they have more possibilities.

# Create example data for the entropy comparison (seeded for reproducibility)
set.seed(123)

# Low entropy data (concentrated around a single value; a 6 can never occur
# because its sampling probability is 0)
low_entropy <- data.frame(
  value = sample(x = 1:6, size = 10000, replace = TRUE,
                 prob = c(.3, .25, .2, .1, .05, 0)),
  type = "Low Entropy"
)

# High entropy data (uniform across all six values, so maximally spread out)
high_entropy <- data.frame(
  value = sample(x = 1:6, size = 10000, replace = TRUE),
  type = "High Entropy"
)

# Combine into one long data frame for faceted plotting
entropy_data <- rbind(low_entropy, high_entropy)

# Create the plots
# Histograms of the two simulated distributions. geom_histogram() plots
# counts by default, so the y axis is labelled "Count" (the original said
# "Density", which is not what is drawn).
ggplot(entropy_data, aes(x = value)) +
  geom_histogram(fill = "#1c5253", alpha = 0.5, binwidth = 1, color = "white") +
  facet_wrap(~type) +
  labs(title = "Comparing High and Low Entropy Distributions",
       x = "Value",
       y = "Count") +
  theme_cowplot() +
  theme(strip.background = element_rect(fill = "#1c5253"),
        strip.text = element_text(color = "white", size = 12))

When you choose a prior and a likelihood function for your data – the distribution representing the behavior of your outcome – the best choice is the distribution that maximizes entropy while still meeting the constraints of your variable.


maximum entropy


what are your options?

So many! But here are a few:


binomial

  • outcome of each trial is binary (yes/no, happened/didn’t happen)
  • fixed number of trials
    • A Bernoulli model is a special case of the binomial with only 1 trial
  • probability of outcome is the same across trials
  • trials are independent

In general, this distribution is your best bet for binary outcomes.


# Grid of counts x trials x success probabilities for the binomial pmf
x <- seq(0, 100, by = 1)
N <- c(1, 5, 100)
p <- c(.25, .40, .70)  # use <-, not =, for top-level assignment

binom_data <- expand.grid(x = x, N = N, p = p) %>%
  filter(x <= N) %>%  # a count can never exceed the number of trials
  mutate(density = dbinom(x, prob = p, size = N),
         p = str_c("p = ", p),
         # zero-pad N so facet panels order numerically ("001" < "005" < "100")
         N = factor(N, levels = c(1, 5, 100), labels = c("001", "005", "100")),
         N = str_c("N = ", N))

# Probability mass function for each (N, p) combination.
ggplot(binom_data, aes(x = x, y = density, fill = p)) +
  # geom_col() is the idiomatic replacement for geom_bar(stat = "identity")
  geom_col() +
  labs(title = "Binomial Distribution with Different Parameters",
       x = "x",
       y = "Density",
       fill = "p") +  # the mapped aesthetic is fill, not color
  theme_cowplot() +
  facet_wrap(p ~ N, scales = "free") +
  # "none" replaces the deprecated logical FALSE (and the unsafe shortcut F)
  guides(fill = "none")


exponential

  • constrained to be zero or positive.
  • distance and duration, or displacement from some point of reference.
    • If the probability of an event is constant in time or across space, then the distribution of events tends towards exponential.
  • maximum entropy among all non-negative continuous distributions with the same average displacement.
  • described by a single parameter, the rate of events \(\lambda\), or the average displacement \(\lambda^{-1}\).
  • common to survival and event history analysis
# Grid of x values crossed with three rate parameters for the exponential pdf
x <- seq(0, 5, length.out = 1000)
rates <- c(0.5, 1, 2)

exp_data <- expand.grid(x = x, rate = rates) %>%
  mutate(density = dexp(x, rate)) %>%
  mutate(rate = factor(rate, labels = paste("λ =", rates)))

# Exponential densities, one line per rate. linewidth replaces the size
# aesthetic for lines, which is deprecated since ggplot2 3.4.0.
ggplot(exp_data, aes(x = x, y = density, color = rate)) +
  geom_line(linewidth = 1) +
  labs(title = "Exponential Distribution with Different Rate Parameters",
       x = "x",
       y = "Density",
       color = "Rate") +
  theme_cowplot() +
  scale_color_manual(values = c("#1c5253", "#5e8485", "#0f393a"))


gamma

  • constrained to be zero or positive.
  • distance and duration, or displacement from some point of reference
  • can have a peak above zero (exponential cannot)
  • maximum entropy among all distributions with the same mean and same average logarithm
  • shape is described by two parameters (\(a\) and \(b\))
# Create data for different (shape, rate) parameter pairs.
x <- seq(0, 10, length.out = 1000)
shapes <- list(c(1, 1), c(2, 2), c(5, 1))
# Renamed from `names`: a local called `names` masks base::names()
param_labels <- c("a=1, b=1", "a=2, b=2", "a=5, b=1")

# Build one data frame per parameter pair, then stack them (base R only).
gamma_data <- do.call(rbind, lapply(seq_along(shapes), function(i) {
  data.frame(
    x = x,
    density = dgamma(x, shape = shapes[[i]][1], rate = shapes[[i]][2]),
    params = param_labels[i]
  )
}))

# Gamma densities, one line per parameter pair. linewidth replaces the
# deprecated size aesthetic for lines (ggplot2 >= 3.4.0).
ggplot(gamma_data, aes(x = x, y = density, color = params)) +
  geom_line(linewidth = 1) +
  labs(title = "Gamma Distribution with Different Shape and Rate Parameters",
       x = "x",
       y = "Density",
       color = "Parameters") +
  theme_cowplot() +
  scale_color_manual(values = c("#1c5253", "#5e8485", "#0f393a"))


poisson

  • count-distributed (like binomial)
    • actually a limiting case of the binomial where \(n\) is large and \(p\) is small
  • used for counts that never get close to any theoretical maximum
  • As a special case of the binomial, it has maximum entropy under exactly the same constraints
  • described by a single parameter, the rate of events \(\lambda\)
# Support 0..15 crossed with three Poisson rates
x <- 0:15
lambdas <- c(1, 3, 7)

pois_data <- expand.grid(x = x, lambda = lambdas) %>%
  mutate(probability = dpois(x, lambda)) %>%
  mutate(lambda = factor(lambda, labels = paste("λ =", lambdas)))

# Bar chart of the Poisson pmf, one fill colour per rate
ggplot(pois_data, aes(x = x, y = probability, fill = lambda)) +
  geom_col(alpha = 0.7, position = "dodge") +
  labs(
    title = "Poisson Distribution with Different Rate Parameters",
    x = "Count",
    y = "Probability",
    fill = "Lambda"
  ) +
  theme_cowplot() +
  scale_fill_manual(values = c("#1c5253", "#e07a5f", "#f2cc8f"))


motivating dataset

From McElreath’s lecture:

# generative model, basic mediator scenario
set.seed(0319)
N <- 1000 # number of applicants
# even gender distribution
G <- sample(1:2, size = N, replace = TRUE)
# gender 1 tends to apply to department 1, 2 to 2
D <- rbinom(n = N, size = 1, prob = ifelse(G == 1, 0.3, 0.8)) + 1
# matrix of acceptance rates: rows index department, columns index gender
accept_rate <- matrix(c(0.5, 0.2, 0.1, 0.3), nrow = 2)
# simulate acceptance. NOTE: accept_rate[D, G] with two length-N index
# vectors returns an N x N submatrix, and rbinom() silently uses only its
# first N elements (the first column), so gender would be ignored beyond
# G[1]. Indexing with cbind(D, G) extracts one rate per applicant, which
# is what the scenario intends.
A <- rbinom(n = N, size = 1, prob = accept_rate[cbind(D, G)])

dat <- data.frame(D = as.character(D), G = as.character(G), A)

We’re going to fit two models here, one estimating the total effect of gender on acceptance, and one estimating the direct effect stratifying on department.

\[\begin{align*} A_i &\sim \text{Bernoulli}(p_i) \\ \text{logit}(p_i) &= \alpha[G_i] \\ \alpha_j &\sim \text{Normal}(0,1) \end{align*}\]

# Total effect of gender: acceptance modelled by gender only.
# The 0 + G formula drops the intercept so each gender gets its own
# log-odds parameter (class b in the prior).
m1 <- brm(
  data = dat,
  family = bernoulli,
  A ~ 0 + G,
  prior = c(prior(normal(0, 1), class = b)),
  iter = 5000, warmup = 1000, chains = 4,
  seed = 3,
  file = here("files/models/51.1")
)
## Compiling Stan program...
## Trying to compile a simple C file
## Start sampling

# Print the fitted model summary (coefficients are on the logit scale)
m1
##  Family: bernoulli 
##   Links: mu = logit 
## Formula: A ~ 0 + G 
##    Data: dat (Number of observations: 1000) 
##   Draws: 4 chains, each with iter = 5000; warmup = 1000; thin = 1;
##          total post-warmup draws = 16000
## 
## Regression Coefficients:
##    Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## G1    -1.64      0.13    -1.90    -1.40 1.00    12575     9503
## G2    -1.04      0.10    -1.23    -0.85 1.00    14831    10819
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

\[\begin{align*} A_i &\sim \text{Bernoulli}(p_i) \\ \text{logit}(p_i) &= \alpha[G_i, D_i] \\ \alpha_j &\sim \text{Normal}(0,1) \end{align*}\]

# Direct effect of gender, stratifying by department via the
# gender-by-department interaction (brms' dummy-coded analogue of the
# alpha[G, D] index formulation shown in the math above).
m2 <- brm(
  data = dat,
  family = bernoulli,
  A ~ G * D,
  prior = c(prior(normal(0, 1), class = Intercept),
            prior(normal(0, 1), class = b)),
  iter = 5000, warmup = 1000, chains = 4,
  seed = 3,
  file = here("files/models/51.2")
)
## Compiling Stan program...
## Trying to compile a simple C file
## Start sampling

# Print the fitted model summary for the stratified model
m2
##  Family: bernoulli 
##   Links: mu = logit 
## Formula: A ~ G * D 
##    Data: dat (Number of observations: 1000) 
##   Draws: 4 chains, each with iter = 5000; warmup = 1000; thin = 1;
##          total post-warmup draws = 16000
## 
## Regression Coefficients:
##           Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept    -2.07      0.16    -2.39    -1.75 1.00     9026    10062
## G2            0.33      0.29    -0.25     0.88 1.00     7352     8462
## D2            1.15      0.24     0.68     1.63 1.00     7984     8930
## G2:D2        -0.33      0.35    -1.00     0.35 1.00     6300     7196
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

wait, what do these numbers mean?

# Estimates are on the logit (log-odds) scale; lprior and lp__ are the log
# prior density and the unnormalized log posterior, respectively
posterior_summary(m1) %>% round(2)
##        Estimate Est.Error    Q2.5   Q97.5
## b_G1      -1.64      0.13   -1.90   -1.40
## b_G2      -1.04      0.10   -1.23   -0.85
## lprior    -3.74      0.23   -4.22   -3.32
## lp__    -515.12      0.99 -517.85 -514.16
# Same summary for the department-stratified model
posterior_summary(m2) %>% round(2)
##             Estimate Est.Error    Q2.5   Q97.5
## b_Intercept    -2.07      0.16   -2.39   -1.75
## b_G2            0.33      0.29   -0.25    0.88
## b_D2            1.15      0.24    0.68    1.63
## b_G2:D2        -0.33      0.35   -1.00    0.35
## Intercept      -1.39      0.08   -1.55   -1.23
## lprior         -5.55      0.47   -6.67   -4.84
## lp__         -502.34      1.40 -505.96 -500.60

A reminder that we’re running a LOGISTIC REGRESSION, which is a special case of a BINOMIAL REGRESSION. We have used a link function – the logit link function – to convert our outcome (probability) into something that is not bounded and can be estimated using linear equations. But now we need to go back from the logit to the probability.

\[ q = \text{logit}(p) = \text{log}\left(\frac{p}{1-p}\right) \]

\[ p = \text{logit}^{-1}(q) = \frac{\text{exp}(q)}{1 + \text{exp}(q)} \]


It doesn’t matter how you fit the model. Use add_epred_draws() to get estimated values for each combination. This also gives you the probability, not the logit. Yay!

# All four gender-by-department combinations to predict for
new_dat <- expand.grid(G = c("1", "2"),
                       D = c("1", "2"))

# Peek at the draw-level expected values (.epred is a probability here)
m1 %>%
  add_epred_draws(newdata = new_dat) %>%
  head()
## # A tibble: 6 × 7
## # Groups:   G, D, .row [1]
##   G     D      .row .chain .iteration .draw .epred
##   <fct> <fct> <int>  <int>      <int> <int>  <dbl>
## 1 1     1         1     NA         NA     1  0.149
## 2 1     1         1     NA         NA     2  0.141
## 3 1     1         1     NA         NA     3  0.153
## 4 1     1         1     NA         NA     4  0.174
## 5 1     1         1     NA         NA     5  0.186
## 6 1     1         1     NA         NA     6  0.175

Use add_epred_draws to get the posterior distribution for p for each combination of department and gender.

# Posterior median and 95% interval of p per row of new_dat. m1 has no
# department term, so rows differing only in D get identical estimates.
m1 %>% add_epred_draws(newdata = new_dat) %>% 
  median_qi
## # A tibble: 4 × 9
##   G     D      .row .epred .lower .upper .width .point .interval
##   <fct> <fct> <int>  <dbl>  <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 1     1         1  0.163  0.130  0.197   0.95 median qi       
## 2 1     2         3  0.163  0.130  0.197   0.95 median qi       
## 3 2     1         2  0.261  0.226  0.299   0.95 median qi       
## 4 2     2         4  0.261  0.226  0.299   0.95 median qi
# Under m2 the estimates vary by department as well as gender
m2 %>% add_epred_draws(newdata = new_dat) %>% 
  median_qi
## # A tibble: 4 × 9
##   G     D      .row .epred .lower .upper .width .point .interval
##   <fct> <fct> <int>  <dbl>  <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 1     1         1  0.113 0.0836  0.148   0.95 median qi       
## 2 1     2         3  0.287 0.217   0.367   0.95 median qi       
## 3 2     1         2  0.151 0.0943  0.221   0.95 median qi       
## 4 2     2         4  0.287 0.246   0.330   0.95 median qi

Our other useful function add_predicted_draws() is back in the realm of the outcome: 1’s and 0’s.

# Posterior predictive draws: simulated 0/1 outcomes, not probabilities
m1 %>% add_predicted_draws(newdata = new_dat) %>% 
  head
## # A tibble: 6 × 7
## # Groups:   G, D, .row [1]
##   G     D      .row .chain .iteration .draw .prediction
##   <fct> <fct> <int>  <int>      <int> <int>       <int>
## 1 1     1         1     NA         NA     1           0
## 2 1     1         1     NA         NA     2           0
## 3 1     1         1     NA         NA     3           0
## 4 1     1         1     NA         NA     4           0
## 5 1     1         1     NA         NA     5           0
## 6 1     1         1     NA         NA     6           1

Use add_predicted_draws() to simulate fictional people in these departments:

# Tally simulated outcomes per cell. m1 ignores D, so within a gender the
# counts for the two departments differ only by simulation noise.
m1 %>% add_predicted_draws(newdata = new_dat) %>% 
  ungroup() %>% 
  count(G, D, .prediction)
## # A tibble: 8 × 4
##   G     D     .prediction     n
##   <fct> <fct>       <int> <int>
## 1 1     1               0 13363
## 2 1     1               1  2637
## 3 1     2               0 13357
## 4 1     2               1  2643
## 5 2     1               0 11800
## 6 2     1               1  4200
## 7 2     2               0 11796
## 8 2     2               1  4204
# Same tally under m2, which includes department, so acceptance counts now
# vary across D within each gender
m2 %>% add_predicted_draws(newdata = new_dat) %>% 
  ungroup() %>% 
  count(G, D, .prediction)
## # A tibble: 8 × 4
##   G     D     .prediction     n
##   <fct> <fct>       <int> <int>
## 1 1     1               0 14178
## 2 1     1               1  1822
## 3 1     2               0 11463
## 4 1     2               1  4537
## 5 2     1               0 13527
## 6 2     1               1  2473
## 7 2     2               0 11351
## 8 2     2               1  4649

# Put expected values (.epred, continuous probabilities) and posterior
# predictions (.prediction, 0/1 outcomes) side by side for m1, joined on
# the shared draw/row columns, then facet the histograms by gender.
m1_epred = add_epred_draws(new_dat, m1)
m1_pred = add_predicted_draws(new_dat, m1)
full_join(m1_epred, m1_pred) %>% 
  pivot_longer(cols = c(".epred", ".prediction")) %>% 
  ggplot(aes(x = value)) +
  geom_histogram(position="dodge") +
  facet_grid(G~name, scales="free")
## Joining with `by = join_by(G, D, .row, .chain, .iteration, .draw)`
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.


exercise

Using the data UCBadmit from the rethinking package, fit

  1. a model estimating the total effect of gender on acceptance, and
  2. a model estimating the direct effect of gender stratifying by department.

Hint: these data are given in the aggregated form, not the long form. You’ll need to use a binomial distribution, not a bernoulli. And you’ll need to specify the number of trials. So your formula will look like this:

admit | trials(applications) ~ [predictors go here]

solution

# UCBadmit is aggregated: one row per department-by-gender cell with counts
data(UCBadmit, package = "rethinking")

# shorter alias for the applicant.gender column
UCBadmit$gender = UCBadmit$applicant.gender

# Total effect of gender: binomial likelihood with `applications` as the
# number of trials; 0 + gender gives one log-odds parameter per gender
m3 <- brm(
  data = UCBadmit,
  family = binomial,
  admit | trials(applications) ~ 0 + gender, 
  prior = c( prior( normal(0, 2), class = b) ),
  iter = 5000, warmup = 1000, chains = 4, 
  seed = 3,
  file = here("files/models/51.3")
)
## Compiling Stan program...
## Trying to compile a simple C file
## Start sampling

solution

# Direct effect of gender, stratifying by department via the
# gender-by-department interaction
m4 <- brm(
  data = UCBadmit,
  family = binomial,
  admit | trials(applications) ~ gender*dept, 
  prior = c( prior( normal(0, 2), class = Intercept),
             prior( normal(0, 2), class = b) ),
  iter = 5000, warmup = 1000, chains = 4, 
  seed = 3,
  file = here("files/models/51.4")
)
## Compiling Stan program...
## Trying to compile a simple C file
## Start sampling

exercise

Explore your options with get_variables() and use the spread_draws() function to sample from the posterior distribution of one of your models. What does this return?


# List every variable available in the posterior of m3
get_variables(m3)
##  [1] "b_genderfemale" "b_gendermale"   "lprior"         "lp__"          
##  [5] "accept_stat__"  "stepsize__"     "treedepth__"    "n_leapfrog__"  
##  [9] "divergent__"    "energy__"
# One row per posterior draw, one column per requested coefficient
m3 %>% 
  spread_draws(b_genderfemale, b_gendermale) 
## # A tibble: 16,000 × 5
##    .chain .iteration .draw b_genderfemale b_gendermale
##     <int>      <int> <int>          <dbl>        <dbl>
##  1      1          1     1         -0.914       -0.169
##  2      1          2     2         -0.757       -0.268
##  3      1          3     3         -0.742       -0.274
##  4      1          4     4         -0.697       -0.284
##  5      1          5     5         -0.940       -0.200
##  6      1          6     6         -0.841       -0.247
##  7      1          7     7         -0.828       -0.171
##  8      1          8     8         -0.810       -0.241
##  9      1          9     9         -0.814       -0.294
## 10      1         10    10         -0.783       -0.140
## # ℹ 15,990 more rows

# List every variable available in the posterior of the stratified model
get_variables(m4)
##  [1] "b_Intercept"        "b_gendermale"       "b_deptB"           
##  [4] "b_deptC"            "b_deptD"            "b_deptE"           
##  [7] "b_deptF"            "b_gendermale:deptB" "b_gendermale:deptC"
## [10] "b_gendermale:deptD" "b_gendermale:deptE" "b_gendermale:deptF"
## [13] "Intercept"          "lprior"             "lp__"              
## [16] "accept_stat__"      "stepsize__"         "treedepth__"       
## [19] "n_leapfrog__"       "divergent__"        "energy__"
# Pull every regression coefficient; backticks are required for names
# containing colons (the interaction terms)
m4 %>% 
  spread_draws(b_Intercept, b_gendermale, b_deptB, b_deptC, 
               b_deptD, b_deptE, b_deptF, 
               `b_gendermale:deptB`,`b_gendermale:deptC`, 
               `b_gendermale:deptD`, `b_gendermale:deptE`,
               `b_gendermale:deptF`) 
## # A tibble: 16,000 × 15
##    .chain .iteration .draw b_Intercept b_gendermale b_deptB b_deptC b_deptD
##     <int>      <int> <int>       <dbl>        <dbl>   <dbl>   <dbl>   <dbl>
##  1      1          1     1        1.23       -0.756 -1.28     -1.83   -1.62
##  2      1          2     2        1.16       -0.744 -0.700    -1.65   -1.69
##  3      1          3     3        1.33       -0.759 -0.133    -1.90   -1.99
##  4      1          4     4        1.12       -0.544  0.0159   -1.65   -1.60
##  5      1          5     5        1.21       -0.707 -0.648    -1.88   -1.86
##  6      1          6     6        1.36       -0.826 -0.433    -2.02   -1.89
##  7      1          7     7        1.13       -0.643 -0.656    -1.89   -1.80
##  8      1          8     8        1.20       -0.701 -0.461    -1.78   -1.61
##  9      1          9     9        1.03       -0.602 -0.758    -1.70   -1.35
## 10      1         10    10        1.12       -0.509 -0.605    -1.71   -1.72
## # ℹ 15,990 more rows
## # ℹ 7 more variables: b_deptE <dbl>, b_deptF <dbl>, `b_gendermale:deptB` <dbl>,
## #   `b_gendermale:deptC` <dbl>, `b_gendermale:deptD` <dbl>,
## #   `b_gendermale:deptE` <dbl>, `b_gendermale:deptF` <dbl>

exercise

Use the add_epred_draws() function to estimate expected values based on your models. What is the unit on .epred for this model?


solution

# .epred here is on the response scale of the binomial model: the expected
# number of admits out of each row's `applications` (a count, not a
# probability), because the formula used admit | trials(applications)
UCBadmit %>% add_epred_draws(m3) %>% 
  ungroup() %>% 
  group_by(gender) %>% 
  mean_qi(.epred)
## # A tibble: 2 × 7
##   gender .epred .lower .upper .width .point .interval
##   <fct>   <dbl>  <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 female   92.9   7.32   187.   0.95 mean   qi       
## 2 male    200.   83.1    376.   0.95 mean   qi

exercise

Recreate this figure (from the model without department): this figure shows that women applicants are accepted at a rate .09 to .18 lower than male applicants.


solution

# Posterior distribution of the female-minus-male difference in acceptance
# probability (total-effect model). Each coefficient is a log-odds; convert
# to a probability with the inverse logit and difference draw by draw.
# stats::plogis is the base-R inverse logit, replacing rethinking::inv_logit
# (rethinking is never attached in this file).
m3 %>% 
  spread_draws(b_genderfemale, b_gendermale) %>% 
  mutate(
    across(starts_with("b"), plogis),
    diff = b_genderfemale - b_gendermale) %>% 
  ggplot(aes(x = diff)) +
  geom_density(color = "#1c5253", linewidth = 2) +
  labs(x = "P_female - P_male")

exercise

Recreate this figure (from the model with department).


# One prediction row per gender-by-department cell, with a large trial
# count so .epred / applications closely approximates the acceptance
# probability for that cell
new_dat <- UCBadmit %>%
  distinct(gender, dept) %>%
  mutate(applications = 1e5)

new_dat %>%
  add_epred_draws(m4) %>%
  ungroup() %>%
  select(dept, gender, .draw, .epred) %>%
  pivot_wider(names_from = gender, values_from = .epred) %>%
  mutate(diff = (female - male) / 1e5) %>%
  ggplot(aes(x = diff, color = dept)) +
  geom_density(linewidth = 2) +
  labs(x = "P_female - P_male") +
  guides(color = "none")

exercise

Calculate the probability of acceptance for each gender within each department for both your models. Plot the results.


solution

# Median acceptance probability and 95% interval per department-by-gender
# cell, computed for both models, then plotted side by side
m3_epred <- UCBadmit %>%
  add_epred_draws(m3) %>%
  mutate(prob = .epred / applications) %>%
  median_qi(prob) %>%
  mutate(model = "m3")

m4_epred <- UCBadmit %>%
  add_epred_draws(m4) %>%
  mutate(prob = .epred / applications) %>%
  median_qi(prob) %>%
  mutate(model = "m4")

m3_epred %>%
  full_join(m4_epred) %>%
  ggplot(aes(x = dept, y = prob, color = gender)) +
  geom_errorbar(aes(ymin = .lower, ymax = .upper), width = .5) +
  geom_point() +
  facet_wrap(~model)
## Joining with `by = join_by(dept, applicant.gender, admit, reject, applications,
## gender, .row, prob, .lower, .upper, .width, .point, .interval, model)`