library(tidyverse)
library(psych)
library(cowplot)
library(patchwork)
library(here)
library(brms) 
library(tidybayes) 

entropy

Entropy is a measure of uncertainty. Distributions with high entropy have more uncertainty because they have more possibilities.

# Create example data --------------------------------------------------------
set.seed(123)

# Low entropy data: mass concentrated on a few values (value 6 is impossible,
# prob = 0), so outcomes are relatively predictable.
low_entropy <- data.frame(
  value = sample(x = 1:6, size = 10000, replace = TRUE,
                 prob = c(.3, .25, .2, .1, .05, 0)),
  type = "Low Entropy"
)

# High entropy data: uniform over 1-6, the maximum-entropy distribution for
# six discrete outcomes.
high_entropy <- data.frame(
  value = sample(x = 1:6, size = 10000, replace = TRUE),
  type = "High Entropy"
)

# Stack the two simulated datasets for faceted plotting
entropy_data <- rbind(low_entropy, high_entropy)

# Plot the two distributions side by side.
# NOTE: geom_histogram() draws counts by default, so the y axis is labeled
# "Count" (the original label "Density" did not match what was plotted).
ggplot(entropy_data, aes(x = value)) +
  geom_histogram(fill = "#1c5253", alpha = 0.5, binwidth = 1, color = "white") +
  facet_wrap(~type) +
  labs(title = "Comparing High and Low Entropy Distributions",
       x = "Value",
       y = "Count") +
  theme_cowplot() +
  theme(strip.background = element_rect(fill = "#1c5253"),
        strip.text = element_text(color = "white", size = 12))

When you choose a prior and a likelihood function for your data – the distribution representing the behavior of your outcome – the best choice is the distribution that maximizes entropy while still meeting the constraints of your variable.


maximum entropy


what are your options?

So many! But here are a few:


binomial

  • outcome of each trial is binary (yes/no, happened/didn’t happen)
  • fixed number of trials
    • A Bernoulli model is a special case of the binomial with only 1 trial
  • probability of outcome is the same across trials
  • trials are independent

In general, this distribution is your best bet for binary outcomes.


# Grid of counts (x), trial sizes (N), and success probabilities (p) ----------
x <- seq(0, 100, by = 1)
N <- c(1, 5, 100)
p <- c(.25, .40, .70)  # `<-`, not `=`, for top-level assignment

# Binomial density for every (x, N, p) combination; x > N is impossible, so
# those rows are dropped. Zero-padded N labels keep facet order 1 < 5 < 100.
binom_data <- expand.grid(x = x, N = N, p = p) %>%
  filter(x <= N) %>%
  mutate(density = dbinom(x, prob = p, size = N),
         p = str_c("p = ", p),
         N = factor(N, levels = c(1, 5, 100), labels = c("001", "005", "100")),
         N = str_c("N = ", N))

# Bar chart of the binomial pmf, faceted by (p, N).
# geom_col() is the idiomatic replacement for geom_bar(stat = "identity"),
# and guides() takes "none" rather than the deprecated logical FALSE.
ggplot(binom_data, aes(x = x, y = density, fill = p)) +
  geom_col() +
  labs(title = "Binomial Distribution with Different Parameters",
       x = "x",
       y = "Density",
       color = "p") +
  theme_cowplot() +
  facet_wrap(p ~ N, scales = "free") +
  guides(color = "none", fill = "none")


exponential

  • constrained to be zero or positive.
  • distance and duration, or displacement from some point of reference.
    • If the probability of an event is constant in time or across space, then the distribution of events tends towards exponential.
  • maximum entropy among all non-negative continuous distributions with the same average displacement
  • described by a single parameter, the rate of events λ, or the average displacement 1/λ
  • common to survival and event history analysis
# Exponential density curves for three rate parameters ------------------------
x <- seq(0, 5, length.out = 1000)
rates <- c(0.5, 1, 2)

# Build the grid column by column: one density value per (x, rate) pair, then
# turn rate into a labeled factor so ggplot draws one colored curve per rate.
exp_data <- expand.grid(x = x, rate = rates)
exp_data$density <- dexp(exp_data$x, exp_data$rate)
exp_data$rate <- factor(exp_data$rate, labels = paste("λ =", rates))

# Draw one density curve per rate. `linewidth` replaces the deprecated
# `size` aesthetic for lines (ggplot2 >= 3.4.0).
ggplot(exp_data, aes(x = x, y = density, color = rate)) +
  geom_line(linewidth = 1) +
  labs(title = "Exponential Distribution with Different Rate Parameters",
       x = "x",
       y = "Density",
       color = "Rate") +
  theme_cowplot() +
  scale_color_manual(values = c("#1c5253", "#5e8485", "#0f393a"))


gamma

  • constrained to be zero or positive.
  • distance and duration, or displacement from some point of reference
  • can have a peak above zero (exponential cannot)
  • maximum entropy among all distributions with the same mean and same average logarithm
  • shape is described by two parameters (a and b)
# Gamma density for three (shape, rate) parameter pairs -----------------------
x <- seq(0, 10, length.out = 1000)
shapes <- list(c(1, 1), c(2, 2), c(5, 1))
# Renamed from `names` to avoid shadowing base::names()
param_labels <- c("a=1, b=1", "a=2, b=2", "a=5, b=1")

# One data frame per parameter pair, stacked row-wise
gamma_data <- do.call(rbind, lapply(seq_along(shapes), function(i) {
  data.frame(
    x = x,
    density = dgamma(x, shape = shapes[[i]][1], rate = shapes[[i]][2]),
    params = param_labels[i]
  )
}))

# One density curve per (shape, rate) pair. `linewidth` replaces the
# deprecated `size` aesthetic for lines (ggplot2 >= 3.4.0).
ggplot(gamma_data, aes(x = x, y = density, color = params)) +
  geom_line(linewidth = 1) +
  labs(title = "Gamma Distribution with Different Shape and Rate Parameters",
       x = "x",
       y = "Density",
       color = "Parameters") +
  theme_cowplot() +
  scale_color_manual(values = c("#1c5253", "#5e8485", "#0f393a"))


poisson

  • count-distributed (like binomial)
    • actually a special case of the binomial where n is large and p is small
  • used for counts that never get close to any theoretical maximum
  • As a special case of the binomial, it has maximum entropy under exactly the same constraints
  • described by a single parameter, the rate of events λ
# Poisson pmf over counts 0-15 for three rate parameters ----------------------
x <- 0:15
lambdas <- c(1, 3, 7)

# Same result as the expand.grid() %>% mutate() pipeline, built step by step
pois_data <- expand.grid(x = x, lambda = lambdas)
pois_data$probability <- dpois(pois_data$x, pois_data$lambda)
pois_data$lambda <- factor(pois_data$lambda, labels = paste("λ =", lambdas))

# Dodged bars: one pmf bar per lambda at each count value
ggplot(pois_data, aes(x = x, y = probability, fill = lambda)) +
  geom_col(alpha = 0.7, position = "dodge") +
  labs(title = "Poisson Distribution with Different Rate Parameters",
       y = "Probability",
       x = "Count",
       fill = "Lambda") +
  theme_cowplot() +
  scale_fill_manual(values = c("#1c5253", "#e07a5f", "#f2cc8f"))


motivating dataset

From McElreath’s lecture:

# generative model, basic mediator scenario
# Department (D) mediates the effect of gender (G) on acceptance (A):
# G -> D -> A alongside the direct path G -> A.
set.seed(0319)
N <- 1000 # number of applicants
# even gender distribution
G <- sample( 1:2, size=N, replace=TRUE )
# gender 1 tends to apply to department 1, 2 to 2
D <- rbinom( n=N, size=1, prob=ifelse( G==1 , 0.3 , 0.8 ) ) + 1
# matrix of acceptance rates
# Filled column-wise: rows index department, columns index gender, so
# accept_rate[dept, gender] gives d1g1=0.5, d2g1=0.2, d1g2=0.1, d2g2=0.3
accept_rate <- matrix( c(0.5, 0.2, 0.1, 0.3), nrow=2)
# simulate acceptance: one Bernoulli draw per applicant at their cell's rate
A <- rbinom( n=N, size=1, accept_rate[D,G])

# D and G stored as character so brms treats them as categorical predictors
dat <- data.frame(D=as.character(D), G=as.character(G), A)

We’re going to fit two models here, one estimating the total effect of gender on acceptance, and one estimating the direct effect stratifying on department.

A_i ∼ Bernoulli(p_i)
logit(p_i) = α[G_i]
α_j ∼ Normal(0, 1)

# Total effect of gender on acceptance. Indicator coding (0 + G) estimates
# one log-odds intercept per gender, each with a Normal(0, 1) prior.
# `<-` replaces `=` for top-level assignment per R style convention.
m1 <- brm(
  data = dat,
  family = bernoulli,
  A ~ 0 + G,
  prior = c(prior(normal(0, 1), class = b)),
  iter = 5000, warmup = 1000, chains = 4,
  seed = 3,
  file = here("files/models/51.1")
)
## Compiling Stan program...
## Trying to compile a simple C file
## Start sampling

m1
##  Family: bernoulli 
##   Links: mu = logit 
## Formula: A ~ 0 + G 
##    Data: dat (Number of observations: 1000) 
##   Draws: 4 chains, each with iter = 5000; warmup = 1000; thin = 1;
##          total post-warmup draws = 16000
## 
## Regression Coefficients:
##    Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## G1    -1.64      0.13    -1.90    -1.40 1.00    12575     9503
## G2    -1.04      0.10    -1.23    -0.85 1.00    14831    10819
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

A_i ∼ Bernoulli(p_i)
logit(p_i) = α[G_i, D_i]
α_j ∼ Normal(0, 1)

# Direct effect of gender, stratified by department. G * D expands to
# G + D + G:D, so each gender-department cell gets its own log-odds.
m2 <- brm(
  data = dat,
  family = bernoulli,
  A ~ G * D,
  prior = c(prior(normal(0, 1), class = Intercept),
            prior(normal(0, 1), class = b)),
  iter = 5000, warmup = 1000, chains = 4,
  seed = 3,
  file = here("files/models/51.2")
)
## Compiling Stan program...
## Trying to compile a simple C file
## Start sampling

m2
##  Family: bernoulli 
##   Links: mu = logit 
## Formula: A ~ G * D 
##    Data: dat (Number of observations: 1000) 
##   Draws: 4 chains, each with iter = 5000; warmup = 1000; thin = 1;
##          total post-warmup draws = 16000
## 
## Regression Coefficients:
##           Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept    -2.07      0.16    -2.39    -1.75 1.00     9026    10062
## G2            0.33      0.29    -0.25     0.88 1.00     7352     8462
## D2            1.15      0.24     0.68     1.63 1.00     7984     8930
## G2:D2        -0.33      0.35    -1.00     0.35 1.00     6300     7196
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

wait, what do these numbers mean?

# Summarize every posterior quantity in m1 (coefficients plus lprior, lp__)
posterior_summary(m1) %>% round(2)
##        Estimate Est.Error    Q2.5   Q97.5
## b_G1      -1.64      0.13   -1.90   -1.40
## b_G2      -1.04      0.10   -1.23   -0.85
## lprior    -3.74      0.23   -4.22   -3.32
## lp__    -515.12      0.99 -517.85 -514.16
# Same summary for the department-stratified model m2
posterior_summary(m2) %>% round(2)
##             Estimate Est.Error    Q2.5   Q97.5
## b_Intercept    -2.07      0.16   -2.39   -1.75
## b_G2            0.33      0.29   -0.25    0.88
## b_D2            1.15      0.24    0.68    1.63
## b_G2:D2        -0.33      0.35   -1.00    0.35
## Intercept      -1.39      0.08   -1.55   -1.23
## lprior         -5.55      0.47   -6.67   -4.84
## lp__         -502.34      1.40 -505.96 -500.60

A reminder that we’re running a LOGISTIC REGRESSION, which is a special case of a BINOMIAL REGRESSION. We have used a link function – the logit link function – to convert our outcome (probability) into something that is not bounded and can be estimated using linear equations. But now we need to go back from the logit to the probability.

logit(p) = log(p / (1 − p))

p = exp(q) / (1 + exp(q))


It doesn’t matter how you fit the model. Use add_epred_draws() to get estimated values for each combination. This also gives you the probability, not the logit. Yay!

# All four gender-by-department combinations we want predictions for
new_dat <- expand.grid(G = c("1", "2"),
                       D = c("1", "2"))

# One expected probability (.epred) per posterior draw per row of new_dat;
# add_epred_draws() returns values on the probability scale, not the logit.
m1 %>% add_epred_draws(newdata = new_dat) %>%
  head()
## # A tibble: 6 × 7
## # Groups:   G, D, .row [1]
##   G     D      .row .chain .iteration .draw .epred
##   <fct> <fct> <int>  <int>      <int> <int>  <dbl>
## 1 1     1         1     NA         NA     1  0.149
## 2 1     1         1     NA         NA     2  0.141
## 3 1     1         1     NA         NA     3  0.153
## 4 1     1         1     NA         NA     4  0.174
## 5 1     1         1     NA         NA     5  0.186
## 6 1     1         1     NA         NA     6  0.175

Use add_epred_draws to get the posterior distribution for p for each combination of department and gender.

# Median and 95% quantile interval of p for each row of new_dat under m1.
# m1 ignores D, so rows that differ only in D get identical estimates.
# median_qi() is called with explicit parentheses (bare function names in
# pipes work via magrittr but read ambiguously).
m1 %>% add_epred_draws(newdata = new_dat) %>%
  median_qi()
## # A tibble: 4 × 9
##   G     D      .row .epred .lower .upper .width .point .interval
##   <fct> <fct> <int>  <dbl>  <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 1     1         1  0.163  0.130  0.197   0.95 median qi       
## 2 1     2         3  0.163  0.130  0.197   0.95 median qi       
## 3 2     1         2  0.261  0.226  0.299   0.95 median qi       
## 4 2     2         4  0.261  0.226  0.299   0.95 median qi
# Under m2 the estimates vary by department as well as gender
m2 %>% add_epred_draws(newdata = new_dat) %>%
  median_qi()
## # A tibble: 4 × 9
##   G     D      .row .epred .lower .upper .width .point .interval
##   <fct> <fct> <int>  <dbl>  <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 1     1         1  0.113 0.0836  0.148   0.95 median qi       
## 2 1     2         3  0.287 0.217   0.367   0.95 median qi       
## 3 2     1         2  0.151 0.0943  0.221   0.95 median qi       
## 4 2     2         4  0.287 0.246   0.330   0.95 median qi

Our other useful function add_predicted_draws() is back in the realm of the outcome: 1’s and 0’s.

# Posterior predictive draws: simulated 0/1 outcomes, not probabilities
m1 %>% add_predicted_draws(newdata = new_dat) %>%
  head()
## # A tibble: 6 × 7
## # Groups:   G, D, .row [1]
##   G     D      .row .chain .iteration .draw .prediction
##   <fct> <fct> <int>  <int>      <int> <int>       <int>
## 1 1     1         1     NA         NA     1           0
## 2 1     1         1     NA         NA     2           0
## 3 1     1         1     NA         NA     3           0
## 4 1     1         1     NA         NA     4           0
## 5 1     1         1     NA         NA     5           0
## 6 1     1         1     NA         NA     6           1

Use add_predicted_draws() to simulate fictional people in these departments:

# Tally the simulated 0/1 outcomes per gender-by-department cell
# (16,000 draws per cell: 4 chains x 4,000 post-warmup draws each)
m1 %>% add_predicted_draws(newdata = new_dat) %>% 
  ungroup() %>% 
  count(G, D, .prediction)
## # A tibble: 8 × 4
##   G     D     .prediction     n
##   <fct> <fct>       <int> <int>
## 1 1     1               0 13363
## 2 1     1               1  2637
## 3 1     2               0 13357
## 4 1     2               1  2643
## 5 2     1               0 11800
## 6 2     1               1  4200
## 7 2     2               0 11796
## 8 2     2               1  4204
# Same tally under the department-stratified model m2
m2 %>% add_predicted_draws(newdata = new_dat) %>% 
  ungroup() %>% 
  count(G, D, .prediction)
## # A tibble: 8 × 4
##   G     D     .prediction     n
##   <fct> <fct>       <int> <int>
## 1 1     1               0 14178
## 2 1     1               1  1822
## 3 1     2               0 11463
## 4 1     2               1  4537
## 5 2     1               0 13527
## 6 2     1               1  2473
## 7 2     2               0 11351
## 8 2     2               1  4649

# Compare expected probabilities (.epred) with simulated 0/1 outcomes
# (.prediction) side by side for each gender. `<-` replaces `=` assignment.
m1_epred <- add_epred_draws(new_dat, m1)
m1_pred <- add_predicted_draws(new_dat, m1)
full_join(m1_epred, m1_pred) %>%
  pivot_longer(cols = c(".epred", ".prediction")) %>%
  ggplot(aes(x = value)) +
  geom_histogram(position = "dodge") +
  facet_grid(G ~ name, scales = "free")
## Joining with `by = join_by(G, D, .row, .chain, .iteration, .draw)`
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.


exercise

Using the data UCBadmit from the rethinking package, fit

  1. a model estimating the total effect of gender on acceptance, and
  2. a model estimating the direct effect of gender stratifying by department.

Hint: these data are given in the aggregated form, not the long form. You’ll need to use a binomial distribution, not a bernoulli. And you’ll need to specify the number of trials. So your formula will look like this:

admit | trials(applications) ~ [predictors go here]

solution

# Load the Berkeley admissions data without attaching the rethinking package
data(UCBadmit, package = "rethinking")

# Shorter column name for applicant.gender, used in the model formulas below
UCBadmit$gender <- UCBadmit$applicant.gender

# Total effect of gender on admissions with an aggregated binomial
# likelihood: `admit` successes out of `applications` trials, and indicator
# coding (0 + gender) for one log-odds intercept per gender.
m3 <- brm(
  data = UCBadmit,
  family = binomial,
  admit | trials(applications) ~ 0 + gender, 
  prior = c( prior( normal(0, 2), class = b) ),
  iter = 5000, warmup = 1000, chains = 4, 
  seed = 3,
  file = here("files/models/51.3")
)
## Compiling Stan program...
## Trying to compile a simple C file
## Start sampling

solution

# Direct effect of gender, stratifying by department: gender*dept expands to
# main effects plus all gender-by-department interaction terms.
m4 <- brm(
  data = UCBadmit,
  family = binomial,
  admit | trials(applications) ~ gender*dept, 
  prior = c( prior( normal(0, 2), class = Intercept),
             prior( normal(0, 2), class = b) ),
  iter = 5000, warmup = 1000, chains = 4, 
  seed = 3,
  file = here("files/models/51.4")
)
## Compiling Stan program...
## Trying to compile a simple C file
## Start sampling

exercise

Explore your options with get_variables() and use the spread_draws() function to sample from the posterior distribution of one of your models. What does this return?


# List every parameter name and sampler diagnostic stored in m3's posterior
get_variables(m3)
##  [1] "b_genderfemale" "b_gendermale"   "lprior"         "lp__"          
##  [5] "accept_stat__"  "stepsize__"     "treedepth__"    "n_leapfrog__"  
##  [9] "divergent__"    "energy__"
# spread_draws() returns one row per posterior draw, with the requested
# parameters as columns (here: one log-odds intercept per gender)
m3 %>% 
  spread_draws(b_genderfemale, b_gendermale) 
## # A tibble: 16,000 × 5
##    .chain .iteration .draw b_genderfemale b_gendermale
##     <int>      <int> <int>          <dbl>        <dbl>
##  1      1          1     1         -0.914       -0.169
##  2      1          2     2         -0.757       -0.268
##  3      1          3     3         -0.742       -0.274
##  4      1          4     4         -0.697       -0.284
##  5      1          5     5         -0.940       -0.200
##  6      1          6     6         -0.841       -0.247
##  7      1          7     7         -0.828       -0.171
##  8      1          8     8         -0.810       -0.241
##  9      1          9     9         -0.814       -0.294
## 10      1         10    10         -0.783       -0.140
## # ℹ 15,990 more rows

# m4 adds department main effects and gender-by-department interactions
get_variables(m4)
##  [1] "b_Intercept"        "b_gendermale"       "b_deptB"           
##  [4] "b_deptC"            "b_deptD"            "b_deptE"           
##  [7] "b_deptF"            "b_gendermale:deptB" "b_gendermale:deptC"
## [10] "b_gendermale:deptD" "b_gendermale:deptE" "b_gendermale:deptF"
## [13] "Intercept"          "lprior"             "lp__"              
## [16] "accept_stat__"      "stepsize__"         "treedepth__"       
## [19] "n_leapfrog__"       "divergent__"        "energy__"
# Backticks are required around interaction terms because ":" is not a
# syntactic name character in R
m4 %>% 
  spread_draws(b_Intercept, b_gendermale, b_deptB, b_deptC, 
               b_deptD, b_deptE, b_deptF, 
               `b_gendermale:deptB`,`b_gendermale:deptC`, 
               `b_gendermale:deptD`, `b_gendermale:deptE`,
               `b_gendermale:deptF`) 
## # A tibble: 16,000 × 15
##    .chain .iteration .draw b_Intercept b_gendermale b_deptB b_deptC b_deptD
##     <int>      <int> <int>       <dbl>        <dbl>   <dbl>   <dbl>   <dbl>
##  1      1          1     1        1.23       -0.756 -1.28     -1.83   -1.62
##  2      1          2     2        1.16       -0.744 -0.700    -1.65   -1.69
##  3      1          3     3        1.33       -0.759 -0.133    -1.90   -1.99
##  4      1          4     4        1.12       -0.544  0.0159   -1.65   -1.60
##  5      1          5     5        1.21       -0.707 -0.648    -1.88   -1.86
##  6      1          6     6        1.36       -0.826 -0.433    -2.02   -1.89
##  7      1          7     7        1.13       -0.643 -0.656    -1.89   -1.80
##  8      1          8     8        1.20       -0.701 -0.461    -1.78   -1.61
##  9      1          9     9        1.03       -0.602 -0.758    -1.70   -1.35
## 10      1         10    10        1.12       -0.509 -0.605    -1.71   -1.72
## # ℹ 15,990 more rows
## # ℹ 7 more variables: b_deptE <dbl>, b_deptF <dbl>, `b_gendermale:deptB` <dbl>,
## #   `b_gendermale:deptC` <dbl>, `b_gendermale:deptD` <dbl>,
## #   `b_gendermale:deptE` <dbl>, `b_gendermale:deptF` <dbl>

exercise

Use the add_epred_draws() function to estimate expected values based on your models. What is the unit on .epred for this model?


solution

# For the aggregated binomial model, .epred is an expected COUNT of
# admissions per data row (out of that row's `applications` trials), not a
# probability. group_by() replaces any existing grouping, so the original
# ungroup() before it was redundant and is dropped.
UCBadmit %>% add_epred_draws(m3) %>%
  group_by(gender) %>%
  mean_qi(.epred)
## # A tibble: 2 × 7
##   gender .epred .lower .upper .width .point .interval
##   <fct>   <dbl>  <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 female   92.9   7.32   187.   0.95 mean   qi       
## 2 male    200.   83.1    376.   0.95 mean   qi

exercise

Recreate this figure (from the model without department): this figure shows that women applicants are accepted at a rate .09 to .18 lower than male applicants.


solution

# Posterior distribution of the female-male acceptance-probability gap.
# plogis() is the inverse-logit CDF from the stats package, mathematically
# identical to rethinking::inv_logit, so no extra package is needed here.
m3 %>%
  spread_draws(b_genderfemale, b_gendermale) %>%
  mutate(
    across(starts_with("b"), plogis),
    diff = b_genderfemale - b_gendermale) %>%
  ggplot(aes(x = diff)) +
  geom_density(color = "#1c5253", linewidth = 2) +
  labs(x = "P_female - P_male")

exercise

Recreate this figure (from the model with department).


# One row per gender-department cell; a large trial count (1e5) makes
# .epred / 1e5 an accurate acceptance probability.
new_dat <- distinct(UCBadmit, gender, dept) %>%
  mutate(applications = 1e5)
# Posterior distribution of the female-male gap within each department
add_epred_draws(new_dat, m4) %>%
  ungroup() %>%
  select(dept, gender, .draw, .epred) %>%
  pivot_wider(names_from = gender, values_from = .epred) %>%
  mutate(diff = (female - male) / 1e5) %>%
  ggplot(aes(x = diff, color = dept)) +
  geom_density(linewidth = 2) +
  labs(x = "P_female - P_male") +
  guides(color = "none")

exercise

Calculate the probability of acceptance for each gender within each department for both your models. Plot the results.


solution

# Per-row acceptance probabilities from each model, summarized as median and
# 95% interval, then tagged with the model name. Fixes `=` assignment and
# the inconsistent pipeline indentation of the m4 block.
m3_epred <- UCBadmit %>% add_epred_draws(m3) %>%
  mutate(prob = .epred / applications) %>%
  median_qi(prob) %>%
  mutate(model = "m3")
m4_epred <- UCBadmit %>% add_epred_draws(m4) %>%
  mutate(prob = .epred / applications) %>%
  median_qi(prob) %>%
  mutate(model = "m4")

# Side-by-side comparison: m3 pools departments, m4 stratifies by them
m3_epred %>% full_join(m4_epred) %>%
  ggplot(
    aes(x = dept, y = prob, color = gender)
  ) +
  geom_errorbar(
    aes(ymin = .lower, ymax = .upper),
    width = .5) +
  geom_point() +
  facet_wrap(~model)
## Joining with `by = join_by(dept, applicant.gender, admit, reject, applications,
## gender, .row, prob, .lower, .upper, .width, .point, .interval, model)`