7 Uncertainty

Prerequisites

We will use these package in this chapter:

library("tidyverse")
library("forcats")
library("lubridate")
library("stringr")
library("modelr")
library("broom")
library("purrr")

7.1 Estimation

7.1.1 Unbiasedness and Consistency

In these simulations, we draw a sample of size n from normal distributions with means mu0 and mu1 and standard deviations sd0 and sd1 respectively,

n <- 100
mu0 <- 0
sd0 <- 1
mu1 <- 1
sd1 <- 1
smpl <- tibble(id = seq_len(n),
               # Y if not treated
               Y0 = rnorm(n, mean = mu0, sd = sd0),
               # Y if treated
               Y1 = rnorm(n, mean = mu1, sd = sd1),
               # individual treatment effects
               tau = Y1 - Y0)

The SATE is:

SATE <- mean(smpl[["tau"]])
SATE
#> [1] 0.952

Simulations of RCTs. Write a function that takes the sample as an input smpl, then randomly assigns the treatment to each individual:

sim_treat <- function(smpl) {
  n <- nrow(smpl)
  SATE <- mean(smpl[["tau"]])
  # indexes of obs receiving treatment
  idx <- sample(seq_len(n), floor(nrow(smpl) / 2), replace = FALSE)
  # treat variable are those receiving treatment, else 0
  smpl[["treat"]] <- as.integer(seq_len(nrow(smpl)) %in% idx)
  smpl %>%
    mutate(Y_obs = if_else(treat == 1L, Y1, Y0)) %>%
    group_by(treat) %>%
    summarise(Y_obs = mean(Y_obs)) %>%
    spread(treat, Y_obs) %>%
    rename(Y1_mean = `1`, Y0_mean = `0`) %>%
    mutate(diff_mean = Y1_mean - Y0_mean,
           est_error = diff_mean - SATE)
}

This returns a data frame with columns: Y0_mean (mean \(Y\) for observations not receiving the treatment), Y1_mean (mean \(Y\) for observations receiving the treatment), diff (difference in mean values between the two groups), and est_error (difference between the estimated difference and the known SATE):

sim_treat(smpl)
#> # A tibble: 1 x 4
#>   Y0_mean Y1_mean diff_mean est_error
#>     <dbl>   <dbl>     <dbl>     <dbl>
#> 1 -0.0570   0.875     0.932   -0.0207

Rerun this function sims times:

sims <- 5000
sate_sims <- map_df(seq_len(sims), ~ sim_treat(smpl))
summary(sate_sims[["est_error"]])
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>  -0.484  -0.087  -0.002  -0.001   0.085   0.414

Simulate the PATE:

PATE <- mu1 - mu0

sim_pate <- function(n, mu0, mu1, sd0, sd1) {
  smpl <- tibble(Y0 = rnorm(n, mean = mu0, sd = sd0),
                 Y1 = rnorm(n, mean = mu1, sd = sd1),
                 tau = Y1 - Y0)
  # indexes of obs receiving treatment
  idx <- sample(seq_len(n), floor(nrow(smpl) / 2), replace = FALSE)
  # treat variable are those receiving treatment, else 0
  smpl[["treat"]] <- as.integer(seq_len(nrow(smpl)) %in% idx)
  smpl %>%
    mutate(Y_obs = if_else(treat == 1L, Y1, Y0)) %>%
    group_by(treat) %>%
    summarise(Y_obs = mean(Y_obs)) %>%
    spread(treat, Y_obs) %>%
    rename(Y1_mean = `1`, Y0_mean = `0`) %>%
    mutate(diff_mean = Y1_mean - Y0_mean,
           est_error = diff_mean - PATE)
}

Example of one simulation

sim_pate(n, mu0, mu1, sd0, sd1)
#> # A tibble: 1 x 4
#>   Y0_mean Y1_mean diff_mean est_error
#>     <dbl>   <dbl>     <dbl>     <dbl>
#> 1  -0.109   0.925      1.03    0.0338

pate_sims <-
  map_df(seq_len(sims), ~ sim_pate(n, mu0, mu1, sd0, sd1))
summary(pate_sims[["est_error"]])
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>  -0.677  -0.135  -0.004  -0.003   0.129   0.853

7.1.2 Standard Error

Plot of the sampling distribution of the difference-in-means estimator:

ggplot(pate_sims, aes(x = diff_mean, y = ..density..)) +
  geom_histogram(binwidth = 0.01, boundary = 1) +
  geom_vline(xintercept = PATE, colour = "white", size = 2) +
  ggtitle("Sampling distribution") +
  labs(x = "Difference-in-means estimator")

sd(pate_sims[["diff_mean"]])
#> [1] 0.196

Simulate PATE with a standard error:

sim_pate_se <- function(n, mu0, mu1, sd0, sd1) {
  # PATE - difference in means
  PATE <- mu1 - mu0
  # sample
  smpl <- tibble(Y0 = rnorm(n, mean = mu0, sd = sd0),
                 Y1 = rnorm(n, mean = mu1, sd = sd1),
                 tau = Y1 - Y0)
  # indexes of obs receiving treatment
  idx <- sample(seq_len(n), floor(nrow(smpl) / 2), replace = FALSE)
  # treat variable are those receiving treatment, else 0
  smpl[["treat"]] <- as.integer(seq_len(nrow(smpl)) %in% idx)
  # sample
  smpl %>%
    mutate(Y_obs = if_else(treat == 1L, Y1, Y0)) %>%
    group_by(treat) %>%
    summarise(mean = mean(Y_obs),
              var = var(Y_obs),
              nobs = n()) %>%
    summarise(diff_mean = diff(mean),
              se = sqrt(sum(var / nobs)),
              est_error = diff_mean - PATE)
}

Run a single simulation:

sim_pate_se(n, mu0, mu1, sd0, sd1)
#> # A tibble: 1 x 3
#>   diff_mean    se est_error
#>       <dbl> <dbl>     <dbl>
#> 1      1.05 0.190    0.0539

Run sims simulations:

sims <- 5000
pate_sims_se <-
  map_df(seq_len(sims), ~ sim_pate_se(n, mu0, mu1, sd0, sd1))

Standard deviation of difference-in-means

sd(pate_sims_se[["diff_mean"]])
#> [1] 0.199

Mean of standard errors,

mean(pate_sims_se[["se"]])
#> [1] 0.2

7.1.3 Confidence Intervals

Calculate a \(p\%\) confidence interval for the binomial distribution:

# Sample size
n <- 1000
# point estimate
x_bar <- 0.6
# standard error
se <- sqrt(x_bar * (1 - x_bar) / n)
# Desired Confidence levels
levels <- c(0.99, 0.95, 0.90)

tibble(level = levels) %>%
  mutate(
    ci_lower = x_bar - qnorm(1 - (1 - level) / 2) * se,
    ci_upper = x_bar + qnorm(1 - (1 - level) / 2) * se
  )
#> # A tibble: 3 x 3
#>   level ci_lower ci_upper
#>   <dbl>    <dbl>    <dbl>
#> 1  0.99    0.560    0.640
#> 2  0.95    0.570    0.630
#> 3  0.9     0.575    0.625

Calculate the coverage ratio of the 95% confidence interval in the PATE simulations.

level <- 0.95
pate_sims_se %>%
  mutate(ci_lower = diff_mean - qnorm(1 - (1 - level) / 2) * se,
         ci_upper = diff_mean + qnorm(1 - (1 - level) / 2) * se,
         includes_pate = PATE > ci_lower & PATE < ci_upper) %>%
  summarise(coverage = mean(includes_pate))
#> # A tibble: 1 x 1
#>   coverage
#>      <dbl>
#> 1    0.948

To do this for multiple levels encapsulate the above code in a function with arguments .data (the data frame) and the confidence level, level:

pate_sims_coverage <- function(.data, level = 0.95) {
  mutate(.data,
         ci_lower = diff_mean - qnorm(1 - (1 - level) / 2) * se,
         ci_upper = diff_mean + qnorm(1 - (1 - level) / 2) * se,
         includes_pate = PATE > ci_lower & PATE < ci_upper) %>%
    summarise(coverage = mean(includes_pate))
}

pate_sims_coverage(pate_sims_se, 0.95)
#> # A tibble: 1 x 1
#>   coverage
#>      <dbl>
#> 1    0.948
pate_sims_coverage(pate_sims_se, 0.99)
#> # A tibble: 1 x 1
#>   coverage
#>      <dbl>
#> 1    0.989
pate_sims_coverage(pate_sims_se, 0.90)
#> # A tibble: 1 x 1
#>   coverage
#>      <dbl>
#> 1    0.898

p <- 0.6
n <- 10
alpha <- 0.05
sims <- 5000

Define a function that samples from a Bernoulli distribution, calculates its standard error, and returns a logical value as to whether it contains the true value:

binom_ci_contains <- function(n, p, alpha = 0.05) {
  x <- rbinom(n, size = 1, prob = p)
  x_bar <- mean(x)
  se <- sqrt(x_bar * (1 - x_bar) / n)
  ci_lower <- x_bar - qnorm(1 - alpha / 2) * se
  ci_upper <- x_bar + qnorm(1 - alpha / 2) * se
  (ci_lower <= p) & (p <= ci_upper)
}

We can run this once for given sample size:

n <- 10
binom_ci_contains(n, p)
#> [1] FALSE

Using map_df we can rerun it sims times and calculate the coverage proportion:

mean(map_lgl(seq_len(sims), ~ binom_ci_contains(n, p)))
#> [1] 0.905

Encapsulate the above code in a function that calculates the coverage of a confidence interval with size, n, and success probability, p:

binom_ci_coverage <- function(n, p, sims) {
  mean(map_lgl(seq_len(sims), ~ binom_ci_contains(n, p)))
}

Use binom_ci_coverage to calculate CI coverage for multiple values of the sample size:

tibble(n = c(10L, 100L, 1000L)) %>%
  mutate(coverage = map_dbl(n, binom_ci_coverage, p = !!p, sims = !!sims))
#> # A tibble: 3 x 2
#>       n coverage
#>   <int>    <dbl>
#> 1    10    0.900
#> 2   100    0.943
#> 3  1000    0.948

7.1.4 Margin of Error and Sample Size Calculation in Polls

Write a function to calculate the sample size needed for a given proportion.

moe_pop_prop <- function(MoE) {
  tibble(p = seq(from = 0.01, to = 0.99, by = 0.01),
         n = 1.96 ^ 2 * p * (1 - p) / MoE ^ 2,
         MoE = MoE)
}
moe_pop_prop(0.01)
#> # A tibble: 99 x 3
#>       p     n   MoE
#>   <dbl> <dbl> <dbl>
#> 1  0.01  380.  0.01
#> 2  0.02  753.  0.01
#> 3  0.03 1118.  0.01
#> 4  0.04 1475.  0.01
#> 5  0.05 1825.  0.01
#> 6  0.06 2167.  0.01
#> # ... with 93 more rows

Then use map_df to call this function for different margins of error, and return the entire thing as a data frame with columns: n, p, and MoE.

MoE <- c(0.01, 0.03, 0.05)
props <- map_df(MoE, moe_pop_prop)

Since its a data frame, its easy to plot with ggplot:

ggplot(props, aes(x = p, y = n, colour = factor(MoE))) +
  geom_line() +
  labs(colour = "margin of error",
       x = "population proportion",
       y = "sample size") +
  theme(legend.position = "bottom")

read_csv already recognizes the date columns, so we don’t need to convert them. The 2008 election was on Nov 11, 2008, so we’ll store that in a variable.

ELECTION_DATE <- ymd(20081104)

Load the final vote shares,

data("pres08", package = "qss")

and polling data

data("polls08", package = "qss")

We need to add an additional column to the polls08 data frame which contains the number of days until the election:

polls08 <- polls08 %>%
  mutate(DaysToElection = as.integer(ELECTION_DATE - middate))

For each state calculate the mean of the latest polls,

poll_pred <-
  polls08 %>%
  group_by(state) %>%
  # latest polls in the state
  filter(DaysToElection == min(DaysToElection)) %>%
  # take mean of latest polls and convert from 0-100 to 0-1
  summarise(Obama = mean(Obama) / 100)
# Add confidence itervals
# sample size
sample_size <- 1000
# confidence level
alpha <- 0.05
poll_pred <-
  poll_pred %>%
  mutate(se = sqrt(Obama * (1 - Obama) / sample_size),
         ci_lwr = Obama + qnorm(alpha / 2) * se,
         ci_upr = Obama + qnorm(1 - alpha / 2) * se)
# Add actual outcome
poll_pred <-
  left_join(poll_pred,
            select(pres08, state, actual = Obama),
            by = "state") %>%
  mutate(actual = actual / 100,
         covers = (ci_lwr <= actual) & (actual <= ci_upr))
poll_pred
#> # A tibble: 51 x 7
#>   state Obama     se ci_lwr ci_upr actual covers
#>   <chr> <dbl>  <dbl>  <dbl>  <dbl>  <dbl> <lgl> 
#> 1 AK    0.39  0.0154  0.360  0.420   0.38 TRUE  
#> 2 AL    0.36  0.0152  0.330  0.390   0.39 FALSE 
#> 3 AR    0.44  0.0157  0.409  0.471   0.39 FALSE 
#> 4 AZ    0.465 0.0158  0.434  0.496   0.45 TRUE  
#> 5 CA    0.6   0.0155  0.570  0.630   0.61 TRUE  
#> 6 CO    0.52  0.0158  0.489  0.551   0.54 TRUE  
#> # ... with 45 more rows

In the plot, color the point ranges by whether they include the election day outcome.

ggplot(poll_pred, aes(x = actual, y = Obama,
                      ymin = ci_lwr, ymax = ci_upr,
                      colour = covers)) +
  geom_abline(intercept = 0, slope = 1, colour = "white", size = 2) +
  geom_pointrange() +
  scale_y_continuous("Poll prediction", limits = c(0, 1)) +
  scale_x_continuous("Obama's vote share", limits = c(0, 1)) +
  scale_colour_discrete("CI includes result?") +
  coord_fixed() +
  theme(legend.position = "bottom")

Proportion of polls with confidence intervals that include the election outcome?

poll_pred %>%
  summarise(mean(covers))
#> # A tibble: 1 x 1
#>   `mean(covers)`
#>            <dbl>
#> 1          0.588

poll_pred <-
  poll_pred %>%
  # calc bias
  mutate(bias = Obama - actual) %>%
  # bias corrected prediction, se, and CI
  mutate(Obama_bc = Obama - mean(bias),
         se_bc = sqrt(Obama_bc * (1 - Obama_bc) / sample_size),
         ci_lwr_bc = Obama_bc + qnorm(alpha / 2) * se_bc,
         ci_upr_bc = Obama_bc + qnorm(1 - alpha / 2) * se_bc,
         covers_bc = (ci_lwr_bc <= actual) & (actual <= ci_upr_bc))
poll_pred %>%
  summarise(mean(covers_bc))
#> # A tibble: 1 x 1
#>   `mean(covers_bc)`
#>               <dbl>
#> 1             0.765

7.1.5 Analysis of Randomized Controlled Trials

Load the STAR data from the qss package,

data("STAR", package = "qss")

Add meaningful labels to the classtype variable:

STAR <- STAR %>%
  mutate(classtype = factor(classtype,
                            labels = c("small class", "regular class",
                                       "regular class with aid")))

Summarize scores by classroom type:

classtype_means <-
  STAR %>%
  group_by(classtype) %>%
  summarise(g4reading = mean(g4reading, na.rm = TRUE))

Plot the distribution of scores by classroom type:

classtypes_used <- c("small class", "regular class")
ggplot(filter(STAR,
              classtype %in% classtypes_used,
              !is.na(g4reading)),
       aes(x = g4reading, y = ..density..)) +
  geom_histogram(binwidth = 20) +
  geom_vline(data = filter(classtype_means, classtype %in% classtypes_used),
             mapping = aes(xintercept = g4reading),
             colour = "white", size = 2) +
  facet_grid(classtype ~ .) +
  labs(x = "Fourth grade reading score", y = "Density")

alpha <- 0.05
star_estimates <-
  STAR %>%
  filter(!is.na(g4reading)) %>%
  group_by(classtype) %>%
  summarise(n = n(),
            est = mean(g4reading),
            se = sd(g4reading) / sqrt(n)) %>%
  mutate(lwr = est + qnorm(alpha / 2) * se,
         upr = est + qnorm(1 - alpha / 2) * se)
star_estimates
#> # A tibble: 3 x 6
#>   classtype                  n   est    se   lwr   upr
#>   <fct>                  <int> <dbl> <dbl> <dbl> <dbl>
#> 1 small class              726  723.  1.91  720.  727.
#> 2 regular class            836  720.  1.84  716.  723.
#> 3 regular class with aid   791  721.  1.86  717.  724.

star_estimates %>%
  filter(classtype %in% c("small class", "regular class")) %>%
  # ensure that it is ordered small then regular
  arrange(desc(classtype)) %>%
  summarise(
    se = sqrt(sum(se ^ 2)),
    est = diff(est)
  ) %>%
  mutate(ci_lwr = est + qnorm(alpha / 2) * se,
         ci_up = est + qnorm(1 - alpha / 2) * se)
#> # A tibble: 1 x 4
#>      se   est ci_lwr ci_up
#>   <dbl> <dbl>  <dbl> <dbl>
#> 1  2.65  3.50  -1.70  8.70

Use we could use spread and gather:

star_ate <-
  star_estimates %>%
  filter(classtype %in% c("small class", "regular class")) %>%
  mutate(classtype = fct_recode(factor(classtype),
                                "small" = "small class",
                                "regular" = "regular class")) %>%
  select(classtype, est, se) %>%
  gather(stat, value, -classtype) %>%
  unite(variable, stat, classtype)  %>%
  spread(variable, value) %>%
  mutate(ate_est = est_small - est_regular,
         ate_se = sqrt(se_small ^ 2 + se_regular ^ 2),
         ci_lwr = ate_est + qnorm(alpha / 2) * ate_se,
         ci_upr = ate_est + qnorm(1 - alpha / 2) * ate_se)
star_ate
#> # A tibble: 1 x 8
#>   est_regular est_small se_regular se_small ate_est ate_se ci_lwr ci_upr
#>         <dbl>     <dbl>      <dbl>    <dbl>   <dbl>  <dbl>  <dbl>  <dbl>
#> 1        720.      723.       1.84     1.91    3.50   2.65  -1.70   8.70

7.1.6 Analysis Based on Student’s t-Distribution

Use filter to subset.

t.test(filter(STAR, classtype == "small class")$g4reading,
       filter(STAR, classtype == "regular class")$g4reading)
#> 
#>  Welch Two Sample t-test
#> 
#> data:  filter(STAR, classtype == "small class")$g4reading and filter(STAR, classtype == "regular class")$g4reading
#> t = 1, df = 2000, p-value = 0.2
#> alternative hypothesis: true difference in means is not equal to 0
#> 95 percent confidence interval:
#>  -1.70  8.71
#> sample estimates:
#> mean of x mean of y 
#>       723       720

The function t.test can also take a formula as its first parameter.

t.test(g4reading ~ classtype,
       data = filter(STAR, classtype %in% c("small class", "regular class")))
#> 
#>  Welch Two Sample t-test
#> 
#> data:  g4reading by classtype
#> t = 1, df = 2000, p-value = 0.2
#> alternative hypothesis: true difference in means is not equal to 0
#> 95 percent confidence interval:
#>  -1.70  8.71
#> sample estimates:
#>   mean in group small class mean in group regular class 
#>                         723                         720

7.2 Hypothesis Testing

7.2.1 Tea-Testing Experiment

# Number of cups of tea
cups <- 4
# Number guessed correctly
k <- c(0, seq_len(cups))
true <-  tibble(correct = k * 2,
                n = choose(cups, k) * choose(cups, cups - k)) %>%
  mutate(prob = n / sum(n))
true
#> # A tibble: 5 x 3
#>   correct     n   prob
#>     <dbl> <dbl>  <dbl>
#> 1       0     1 0.0143
#> 2       2    16 0.229 
#> 3       4    36 0.514 
#> 4       6    16 0.229 
#> 5       8     1 0.0143

sims <- 1000
guess <- tibble(guess = c("M", "T", "T", "M", "M", "T", "T", "M"))
randomize_tea <- function(df) {
  # randomize the order of teas
  assignment <- sample_frac(df, 1) %>%
    rename(actual = guess)
  bind_cols(df, assignment) %>%
    summarise(correct = sum(guess == actual))
}
approx <-
  map_df(seq_len(sims), ~ randomize_tea(guess)) %>%
  count(correct) %>%
  mutate(prob = n / sum(n))
left_join(select(approx, correct, prob_sim = prob),
          select(true, correct, prob_exact = prob),
          by = "correct") %>%
  mutate(diff = prob_sim - prob_exact)
#> # A tibble: 5 x 4
#>   correct prob_sim prob_exact     diff
#>     <dbl>    <dbl>      <dbl>    <dbl>
#> 1       0    0.011     0.0143 -0.00329
#> 2       2    0.234     0.229   0.00543
#> 3       4    0.486     0.514  -0.0283 
#> 4       6    0.253     0.229   0.0244 
#> 5       8    0.016     0.0143  0.00171

7.2.2 The General Framework

The test functions like fisher.test do not work particularly well with data frames, and expect vectors or matrices as input, so tidyverse functions are less directly applicable

# all guesses correct
x <- tribble(~Guess, ~Truth, ~Number,
             "Milk", "Milk", 4L,
             "Milk", "Tea", 0L,
             "Tea", "Milk", 0L,
             "Tea", "Tea", 4L)
x
#> # A tibble: 4 x 3
#>   Guess Truth Number
#>   <chr> <chr>  <int>
#> 1 Milk  Milk       4
#> 2 Milk  Tea        0
#> 3 Tea   Milk       0
#> 4 Tea   Tea        4
# 6 correct guesses
y <- x %>%
  mutate(Number = c(3L, 1L, 1L, 3L))
y
#> # A tibble: 4 x 3
#>   Guess Truth Number
#>   <chr> <chr>  <int>
#> 1 Milk  Milk       3
#> 2 Milk  Tea        1
#> 3 Tea   Milk       1
#> 4 Tea   Tea        3
# Turn into a 2x2 table for fisher.test
select(spread(x, Truth, Number), -Guess)
#> # A tibble: 2 x 2
#>    Milk   Tea
#>   <int> <int>
#> 1     4     0
#> 2     0     4
# Use spread to make it a 2 x 2 table
fisher.test(select(spread(x, Truth, Number), -Guess),
            alternative = "greater")
#> 
#>  Fisher's Exact Test for Count Data
#> 
#> data:  select(spread(x, Truth, Number), -Guess)
#> p-value = 0.01
#> alternative hypothesis: true odds ratio is greater than 1
#> 95 percent confidence interval:
#>    2 Inf
#> sample estimates:
#> odds ratio 
#>        Inf
fisher.test(select(spread(y, Truth, Number), -Guess))
#> 
#>  Fisher's Exact Test for Count Data
#> 
#> data:  select(spread(y, Truth, Number), -Guess)
#> p-value = 0.5
#> alternative hypothesis: true odds ratio is not equal to 1
#> 95 percent confidence interval:
#>    0.212 621.934
#> sample estimates:
#> odds ratio 
#>       6.41

7.2.3 One-Sample Tests

n <- 1018
x_bar <- 550 / n
se <- sqrt(0.5 * 0.5 / n) # standard deviation of sampling distribution
# upper red area in the figure
upper <- pnorm(x_bar, mean = 0.5, sd = se, lower.tail = FALSE)
# lower red area in the figure; identical to the upper area
lower <- pnorm(0.5 - (x_bar - 0.5), mean = 0.5, sd = se)
# two side p value
upper + lower
#> [1] 0.0102
2 * upper
#> [1] 0.0102
# one sided p value
upper
#> [1] 0.00508
z_score <- (x_bar - 0.5) / se
z_score
#> [1] 2.57
pnorm(z_score, lower.tail = FALSE) # one-sided p-value
#> [1] 0.00508
2 * pnorm(z_score, lower.tail = FALSE) # two-sided p-value
#> [1] 0.0102
# 99% confidence interval contains 0.5
c(x_bar - qnorm(0.995) * se, x_bar + qnorm(0.995) * se)
#> [1] 0.500 0.581
# 95% confidence interval does not contain 0.5
c(x_bar - qnorm(0.975) * se, x_bar + qnorm(0.975) * se)
#> [1] 0.510 0.571
# no continuity correction to get the same p-value as above
prop.test(550, n = n, p = 0.5, correct = FALSE)
#> 
#>  1-sample proportions test without continuity correction
#> 
#> data:  550 out of n, null probability 0.5
#> X-squared = 7, df = 1, p-value = 0.01
#> alternative hypothesis: true p is not equal to 0.5
#> 95 percent confidence interval:
#>  0.510 0.571
#> sample estimates:
#>    p 
#> 0.54
# with continuity correction
prop.test(550, n = n, p = 0.5)
#> 
#>  1-sample proportions test with continuity correction
#> 
#> data:  550 out of n, null probability 0.5
#> X-squared = 6, df = 1, p-value = 0.01
#> alternative hypothesis: true p is not equal to 0.5
#> 95 percent confidence interval:
#>  0.509 0.571
#> sample estimates:
#>    p 
#> 0.54
prop.test(550, n = n, p = 0.5, conf.level = 0.99)
#> 
#>  1-sample proportions test with continuity correction
#> 
#> data:  550 out of n, null probability 0.5
#> X-squared = 6, df = 1, p-value = 0.01
#> alternative hypothesis: true p is not equal to 0.5
#> 99 percent confidence interval:
#>  0.499 0.581
#> sample estimates:
#>    p 
#> 0.54

# two-sided one-sample t-test
t.test(STAR$g4reading, mu = 710)
#> 
#>  One Sample t-test
#> 
#> data:  STAR$g4reading
#> t = 10, df = 2000, p-value <2e-16
#> alternative hypothesis: true mean is not equal to 710
#> 95 percent confidence interval:
#>  719 723
#> sample estimates:
#> mean of x 
#>       721

7.2.4 Two-sample tests

The ATE estimates are stored in a data frame, star_ate. Note that the dplyr function transmute is like mutate, but only returns the variables specified in the function.

star_ate %>%
  transmute(p_value_1sided = pnorm(-abs(ate_est),
                                   mean = 0, sd = ate_se),
            p_value_2sided = 2 * pnorm(-abs(ate_est), mean = 0,
                                   sd = ate_se))
#> # A tibble: 1 x 2
#>   p_value_1sided p_value_2sided
#>            <dbl>          <dbl>
#> 1         0.0935          0.187

t.test(g4reading ~ classtype,
  data = filter(STAR, classtype %in% c("small class", "regular class")))
#> 
#>  Welch Two Sample t-test
#> 
#> data:  g4reading by classtype
#> t = 1, df = 2000, p-value = 0.2
#> alternative hypothesis: true difference in means is not equal to 0
#> 95 percent confidence interval:
#>  -1.70  8.71
#> sample estimates:
#>   mean in group small class mean in group regular class 
#>                         723                         720

t.test(filter(STAR, classtype == "small class")$g4reading,
       filter(STAR, classtype == "regular class")$g4reading)
#> 
#>  Welch Two Sample t-test
#> 
#> data:  filter(STAR, classtype == "small class")$g4reading and filter(STAR, classtype == "regular class")$g4reading
#> t = 1, df = 2000, p-value = 0.2
#> alternative hypothesis: true difference in means is not equal to 0
#> 95 percent confidence interval:
#>  -1.70  8.71
#> sample estimates:
#> mean of x mean of y 
#>       723       720

data("resume", package = "qss")
x <- resume %>%
  count(race, call) %>%
  spread(call, n) %>%
  ungroup()
x
#> # A tibble: 2 x 3
#>   race    `0`   `1`
#>   <chr> <int> <int>
#> 1 black  2278   157
#> 2 white  2200   235
prop.test(as.matrix(select(x, -race)), alternative = "greater")
#> 
#>  2-sample test for equality of proportions with continuity
#>  correction
#> 
#> data:  as.matrix(select(x, -race))
#> X-squared = 20, df = 1, p-value = 2e-05
#> alternative hypothesis: greater
#> 95 percent confidence interval:
#>  0.0188 1.0000
#> sample estimates:
#> prop 1 prop 2 
#>  0.936  0.903

Assign sample sizes and proportions, then calculate point estimates, standard error, z-statistic and one-sided p-value.

n0 <- sum(resume$race == "black")
n1 <- sum(resume$race == "white")

p <- mean(resume$call)
p0 <- mean(filter(resume, race == "black")$call)
p1 <- mean(filter(resume, race == "white")$call)

est <- p1 - p0
est
#> [1] 0.032

se <- sqrt(p * (1 - p) * (1 / n0 + 1 / n1))
se
#> [1] 0.0078

zstat <- est / se
zstat
#> [1] 4.11

pnorm(-abs(zstat))
#> [1] 1.99e-05

The only thing that changed is using filter for selecting the groups.

7.2.5 Power Analysis

Set the parameters: the sample size,

n <- 250

the population proportional under the alternative data generating process,

p_star <- 0.48

the null hypothesis,

p <- 0.5

the p-value,

alpha <- 0.05

and

cr_value <- qnorm(1 - alpha / 2)

The standard errors under the hypothetical data generating process is

se_star <- sqrt(p_star * (1 - p_star) / n)

and the standard error under the null is

se <- sqrt(p * (1 - p) / n)

The power for this test is

pnorm(p - cr_value * se, mean = p_star, sd = se_star) +
    pnorm(p + cr_value * se, mean = p_star, sd = se_star,
          lower.tail = FALSE)
#> [1] 0.0967

The parameters (sample sizes and proportions) are

n1 <- 500
n0 <- 500
p1_star <- 0.05
p0_star <- 0.1

Calculate the overall call back rate as a weighted average,

p <- (n1 * p1_star + n0 * p0_star) / (n1 + n0)

the standard error under the null,

se <- sqrt(p * (1 - p) * (1 / n1 + 1 / n0))

the standard error under the hypothetical data generating process,

se.star <- sqrt(p1_star * (1 - p1_star) / n1 + p0_star * (1 - p0_star) / n0)

pnorm(-cr_value * se, mean = p1_star - p0_star, sd = se.star) +
    pnorm(cr_value * se, mean = p1_star - p0_star, sd = se.star,
          lower.tail = FALSE)
#> [1] 0.852

power.prop.test(n = 500, p1 = 0.05, p2 = 0.1, sig.level = 0.05)
#> 
#>      Two-sample comparison of proportions power calculation 
#> 
#>               n = 500
#>              p1 = 0.05
#>              p2 = 0.1
#>       sig.level = 0.05
#>           power = 0.852
#>     alternative = two.sided
#> 
#> NOTE: n is number in *each* group

power.prop.test(p1 = 0.05, p2 = 0.1, sig.level = 0.05, power = 0.9)
#> 
#>      Two-sample comparison of proportions power calculation 
#> 
#>               n = 581
#>              p1 = 0.05
#>              p2 = 0.1
#>       sig.level = 0.05
#>           power = 0.9
#>     alternative = two.sided
#> 
#> NOTE: n is number in *each* group

power.t.test(n = 100, delta = 0.25, sd = 1, type = "one.sample")
#> 
#>      One-sample t test power calculation 
#> 
#>               n = 100
#>           delta = 0.25
#>              sd = 1
#>       sig.level = 0.05
#>           power = 0.697
#>     alternative = two.sided

power.t.test(power = 0.9, delta = 0.25, sd = 1, type = "one.sample")
#> 
#>      One-sample t test power calculation 
#> 
#>               n = 170
#>           delta = 0.25
#>              sd = 1
#>       sig.level = 0.05
#>           power = 0.9
#>     alternative = two.sided

power.t.test(delta = 0.25, sd = 1, type = "two.sample",
             alternative = "one.sided", power = 0.9)
#> 
#>      Two-sample t test power calculation 
#> 
#>               n = 275
#>           delta = 0.25
#>              sd = 1
#>       sig.level = 0.05
#>           power = 0.9
#>     alternative = one.sided
#> 
#> NOTE: n is number in *each* group

7.3 Linear Regression Model with Uncertainty

7.3.1 Linear Regression as a Generative Model

Load the minimum wage date included with the qss package:

data("minwage", package = "qss")

minwage <- mutate(minwage,
                  fullPropBefore = fullBefore / (fullBefore + partBefore),
                  fullPropAfter = fullAfter / (fullAfter + partAfter),
                  NJ = as.integer(location == "PA"))

fit_minwage <- lm(fullPropAfter ~ -1 + NJ + fullPropBefore +
                    wageBefore + chain, data = minwage)
fit_minwage
#> 
#> Call:
#> lm(formula = fullPropAfter ~ -1 + NJ + fullPropBefore + wageBefore + 
#>     chain, data = minwage)
#> 
#> Coefficients:
#>              NJ   fullPropBefore       wageBefore  chainburgerking  
#>         -0.0542           0.1688           0.0813          -0.0614  
#>        chainkfc        chainroys      chainwendys  
#>         -0.0966          -0.1522          -0.1659
fit_minwage1 <- lm(fullPropAfter ~ NJ + fullPropBefore +
                     wageBefore + chain, data = minwage)
fit_minwage1
#> 
#> Call:
#> lm(formula = fullPropAfter ~ NJ + fullPropBefore + wageBefore + 
#>     chain, data = minwage)
#> 
#> Coefficients:
#>    (Intercept)              NJ  fullPropBefore      wageBefore  
#>        -0.0614         -0.0542          0.1688          0.0813  
#>       chainkfc       chainroys     chainwendys  
#>        -0.0352         -0.0908         -0.1045
gather_predictions(slice(minwage, 1), fit_minwage, fit_minwage1) %>%
  select(model, pred)
#>          model  pred
#> 1  fit_minwage 0.271
#> 2 fit_minwage1 0.271

7.3.2 Inference about coefficients

Use the tidy function to return the coefficients, including confidence intervals, as a data frame:

data("women", package = "qss")
fit_women <- lm(water ~ reserved, data = women)
summary(fit_women)
#> 
#> Call:
#> lm(formula = water ~ reserved, data = women)
#> 
#> Residuals:
#>    Min     1Q Median     3Q    Max 
#> -23.99 -14.74  -7.86   2.26 316.01 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)    14.74       2.29    6.45  4.2e-10 ***
#> reserved        9.25       3.95    2.34     0.02 *  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 33.4 on 320 degrees of freedom
#> Multiple R-squared:  0.0169, Adjusted R-squared:  0.0138 
#> F-statistic: 5.49 on 1 and 320 DF,  p-value: 0.0197
tidy(fit_women)
#>          term estimate std.error statistic  p.value
#> 1 (Intercept)    14.74      2.29      6.45 4.22e-10
#> 2    reserved     9.25      3.95      2.34 1.97e-02

You need to set conf.int = TRUE for tidy to include the confidence interval:

summary(fit_minwage)
#> 
#> Call:
#> lm(formula = fullPropAfter ~ -1 + NJ + fullPropBefore + wageBefore + 
#>     chain, data = minwage)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -0.4862 -0.1813 -0.0281  0.1513  0.7509 
#> 
#> Coefficients:
#>                 Estimate Std. Error t value Pr(>|t|)   
#> NJ               -0.0542     0.0332   -1.63   0.1034   
#> fullPropBefore    0.1688     0.0566    2.98   0.0031 **
#> wageBefore        0.0813     0.0389    2.09   0.0374 * 
#> chainburgerking  -0.0614     0.1755   -0.35   0.7266   
#> chainkfc         -0.0966     0.1793   -0.54   0.5904   
#> chainroys        -0.1522     0.1832   -0.83   0.4066   
#> chainwendys      -0.1659     0.1853   -0.90   0.3711   
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 0.244 on 351 degrees of freedom
#> Multiple R-squared:  0.635,  Adjusted R-squared:  0.628 
#> F-statistic: 87.2 on 7 and 351 DF,  p-value: <2e-16
tidy(fit_minwage, conf.int = TRUE)
#>              term estimate std.error statistic p.value conf.low conf.high
#> 1              NJ  -0.0542    0.0332    -1.633 0.10343 -0.11953    0.0111
#> 2  fullPropBefore   0.1688    0.0566     2.981 0.00307  0.05744    0.2801
#> 3      wageBefore   0.0813    0.0389     2.090 0.03737  0.00478    0.1579
#> 4 chainburgerking  -0.0614    0.1755    -0.350 0.72657 -0.40648    0.2837
#> 5        chainkfc  -0.0966    0.1793    -0.539 0.59044 -0.44919    0.2560
#> 6       chainroys  -0.1522    0.1832    -0.831 0.40664 -0.51238    0.2080
#> 7     chainwendys  -0.1659    0.1853    -0.895 0.37115 -0.53031    0.1985

7.3.3 Inference about predictions

data("MPs", package = "qss")
MPs_labour <- filter(MPs, party == "labour")
MPs_tory <- filter(MPs, party == "tory")
labour_fit1 <- lm(ln.net ~ margin, data = filter(MPs_labour, margin < 0))
labour_fit2 <- lm(ln.net ~ margin, data = filter(MPs_labour, margin > 0))
tory_fit1 <- lm(ln.net ~ margin, data = filter(MPs_tory, margin < 0))
tory_fit2 <- lm(ln.net ~ margin, data = filter(MPs_tory, margin > 0))

Predictions at the threshold. The broom function augment will return prediction fitted values and standard errors for each value, but not the confidence intervals themselves (we’d have to multiply the correct t-distribution with degrees of freedom.) So instead, we’ll directly use the predict function:

tory_y0 <-
  predict(tory_fit1, interval = "confidence",
          newdata = tibble(margin = 0)) %>% as_tibble()
tory_y0
#> # A tibble: 1 x 3
#>     fit   lwr   upr
#>   <dbl> <dbl> <dbl>
#> 1  12.5  12.1  13.0
tory_y1 <-
  predict(tory_fit2, interval = "confidence",
          newdata = tibble(margin = 0)) %>% as_tibble()
tory_y1
#> # A tibble: 1 x 3
#>     fit   lwr   upr
#>   <dbl> <dbl> <dbl>
#> 1  13.2  12.8  13.6

Alternatively, using augment (and assuming a normal distribution since the number of observations is so large its not worth worrying about the t-distribution):

tory_y0 <-
  augment(tory_fit1, newdata = tibble(margin = 0)) %>%
  mutate(lwr = .fitted + qnorm(0.025) * .se.fit,
         upr = .fitted + qnorm(0.975) * .se.fit)
tory_y0
#>   margin .fitted .se.fit  lwr upr
#> 1      0    12.5   0.214 12.1  13
tory_y1 <-
  augment(tory_fit2, newdata = tibble(margin = 0)) %>%
  mutate(lwr = .fitted + qnorm(0.025) * .se.fit,
         upr = .fitted + qnorm(0.975) * .se.fit)
tory_y1
#>   margin .fitted .se.fit  lwr  upr
#> 1      0    13.2   0.192 12.8 13.6

y1_range <- data_grid(filter(MPs_tory, margin <= 0), margin)
tory_y0 <- augment(tory_fit1, newdata = y1_range)
y2_range <- data_grid(filter(MPs_tory, margin >= 0), margin)
tory_y1 <- augment(tory_fit2, newdata = y2_range)

ggplot() +
  geom_ref_line(v = 0) +
  geom_point(aes(y = ln.net, x = margin), data = MPs_tory) +
  # plot losers
  geom_ribbon(aes(x = margin,
                  ymin = .fitted + qnorm(0.025) * .se.fit,
                  ymax = .fitted + qnorm(0.975) * .se.fit),
              data = tory_y0, alpha = 0.3) +
  geom_line(aes(x = margin, y = .fitted), data = tory_y0) +
  # plot winners
  geom_ribbon(aes(x = margin,
                  ymin = .fitted + qnorm(0.025) * .se.fit,
                  ymax = .fitted + qnorm(0.975) * .se.fit),
              data = tory_y1, alpha = 0.3) +
  geom_line(aes(x = margin, y = .fitted), data = tory_y1) +
  labs(x = "Margin of vitory", y = "log net wealth")

tory_y1 <- augment(tory_fit1, newdata = tibble(margin = 0))
tory_y1
#>   margin .fitted .se.fit
#> 1      0    12.5   0.214
tory_y0 <- augment(tory_fit2, newdata = tibble(margin = 0))
tory_y0
#>   margin .fitted .se.fit
#> 1      0    13.2   0.192
summary(tory_fit1)
#> 
#> Call:
#> lm(formula = ln.net ~ margin, data = filter(MPs_tory, margin < 
#>     0))
#> 
#> Residuals:
#>    Min     1Q Median     3Q    Max 
#> -5.320 -0.472 -0.035  0.663  3.580 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)   12.538      0.214   58.54   <2e-16 ***
#> margin         1.491      1.291    1.15     0.25    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 1.43 on 119 degrees of freedom
#> Multiple R-squared:  0.0111, Adjusted R-squared:  0.00277 
#> F-statistic: 1.33 on 1 and 119 DF,  p-value: 0.251
summary(tory_fit2)
#> 
#> Call:
#> lm(formula = ln.net ~ margin, data = filter(MPs_tory, margin > 
#>     0))
#> 
#> Residuals:
#>    Min     1Q Median     3Q    Max 
#> -3.858 -0.877  0.001  0.830  3.126 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)   13.188      0.192   68.69   <2e-16 ***
#> margin        -0.728      1.982   -0.37     0.71    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 1.29 on 100 degrees of freedom
#> Multiple R-squared:  0.00135,    Adjusted R-squared:  -0.00864 
#> F-statistic: 0.135 on 1 and 100 DF,  p-value: 0.714

Since we aren’t doing anything more with these values, there isn’t much benefit in keeping them in data frames.

# standard error
se_diff <- sqrt(tory_y0$.se.fit ^ 2 + tory_y1$.se.fit ^ 2)
se_diff
#> [1] 0.288
# point estimate
diff_est <- tory_y1$.fitted - tory_y0$.fitted
diff_est
#> [1] -0.65
# confidence interval
CI <- c(diff_est - se_diff * qnorm(0.975),
        diff_est + se_diff * qnorm(0.975))
CI
#> [1] -1.2134 -0.0859
# hypothesis test
z_score <- diff_est / se_diff
# two sided p value
p_value <- 2 * pnorm(abs(z_score), lower.tail = FALSE)
p_value
#> [1] 0.0239