6 Probability
Prerequisites
library("tidyverse")
library("forcats")
library("stringr")
library("broom")
6.1 Probability
6.1.1 Frequentist vs. Bayesian
6.1.2 Definition and Axioms
6.1.3 Permutations
birthday <- function(k) {
logdenom <- k * log(365) + lfactorial(365 - k)
lognumer <- lfactorial(365)
pr <- 1 - exp(lognumer - logdenom)
pr
}
bday <- tibble(k = 1:50, pr = birthday(k))
ggplot(bday, aes(x = k, y = pr)) +
geom_hline(yintercept = 0.5, colour = "white", size = 2) +
geom_line() +
geom_point() +
scale_y_continuous(str_c("Probability that at least two",
"people have the same birthday", sep = "\n"),
limits = c(0, 1), breaks = seq(0, 1, by = 0.1)) +
labs(x = "Number of people")
Note: The logarithm is used for numerical stability. "Floating-point" numbers are approximations of real numbers, and arithmetic with values that are very large, very small, or that differ greatly in magnitude can go badly wrong. Working on the log scale avoids some of those issues. See "Falling Into the Floating Point Trap" in The R Inferno for a summary of floating-point numbers, these John Fox posts 1 2 for an example of numerical stability gone wrong, and this article on the log-sum-exp trick.
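To see why the log scale matters here, note that the factorials in the birthday formula overflow double precision, while their logarithms are ordinary numbers; a quick sketch:
# 365! is far beyond the largest representable double, so this overflows to Inf
factorial(365)
# the same quantity on the log scale is a perfectly ordinary number
lfactorial(365)
# the difference of two huge log terms is small and stable, e.g. the
# probability that 23 birthdays are all distinct:
exp(lfactorial(365) - lfactorial(365 - 23) - 23 * log(365))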
6.1.4 Sampling without replacement
Instead of using a for loop, we could do the simulations using a functional, as described in the R for Data Science chapter "Iteration".
Define the function sim_bdays, which randomly samples k birthdays and returns TRUE if there are any duplicate birthdays and FALSE if there are none.
sim_bdays <- function(k) {
days <- sample(1:365, k, replace = TRUE)
length(unique(days)) < k
}
We can test the code for k = 10 birthdays.
sim_bdays(10)
#> [1] FALSE
Since the function is randomly sampling birthdays, running it multiple times will produce different answers.
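If you want reproducible draws, you can set the random-number seed before calling the function; for example (a minimal sketch):
set.seed(12345)  # any fixed seed makes the sampled birthdays reproducible
sim_bdays(10)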
One helpful feature of a functional style of writing code vs. a for loop is that the function encapsulates the code and allows you to test that it works for different inputs before repeating it for many inputs. It is more difficult to debug functions that produce random outputs, but some sanity checks are that the function:
- returns a logical vector of length one (TRUE or FALSE)
- always returns FALSE when k = 1, since there can never be a duplicate with one person
- always returns TRUE when k > 365, by the pigeonhole principle.
sim_bdays(1)
#> [1] FALSE
sim_bdays(366)
#> [1] TRUE
Set the parameters for 1,000 simulations, and 23 individuals. We use map_lgl since sim_bdays returns a logical value (TRUE, FALSE):
sims <- 1000
k <- 23
map_lgl(seq_len(sims), ~ sim_bdays(k)) %>%
mean()
#> [1] 0.489
An alternative way of running this is to use rerun and then flatten_dbl to turn the output into a numeric vector:
rerun(sims, sim_bdays(k)) %>%
flatten_dbl() %>%
mean()
#> [1] 0.477
6.1.5 Combinations
The function for \(84\choose{6}\) is:
choose(84, 6)
#> [1] 4.06e+08
However, due to the large values the binomial coefficient can take, it is almost always better to use the log of the binomial coefficient, \(\log{84\choose{6}}\),
lchoose(84, 6)
#> [1] 19.8
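If the binomial coefficient itself (rather than its log) is needed, exponentiating recovers it, up to floating-point error:
exp(lchoose(84, 6))  # recovers choose(84, 6), approximately 4.06e+08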
6.2 Conditional Probability
6.2.1 Conditional, Marginal, and Joint Probabilities
Load Florida voting data from the qss package:
data(FLVoters, package = "qss")
dim(FLVoters)
#> [1] 10000 6
glimpse(FLVoters)
#> Observations: 10,000
#> Variables: 6
#> $ surname <chr> "PIEDRA", "LYNCH", "CHESTER", "LATHROP", "HUMMEL", "CH...
#> $ county <int> 115, 115, 115, 115, 115, 115, 115, 115, 1, 1, 115, 115...
#> $ VTD <int> 66, 13, 103, 80, 8, 55, 84, 48, 41, 39, 26, 45, 11, 48...
#> $ age <int> 58, 51, 63, 54, 77, 49, 77, 34, 56, 60, 44, 45, 80, 83...
#> $ gender <chr> "f", "m", "m", "m", "f", "m", "f", "f", "f", "m", "m",...
#> $ race <chr> "white", "white", NA, "white", "white", "white", "whit...
FLVoters <- FLVoters %>%
na.omit()
dim(FLVoters)
#> [1] 9113 6
Note the difference between the glimpse() output above and the dim() output after na.omit(). How many observations contain missing values, and in which variables?
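To see which variables contain the missing values, one option is to count the NAs in each column of the original data. A sketch (the helper environment na_check is only there to avoid overwriting the cleaned FLVoters object):
na_check <- new.env()
data("FLVoters", package = "qss", envir = na_check)
# number of missing values in each column before na.omit()
map_int(na_check$FLVoters, ~ sum(is.na(.x)))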
Instead of using prop.table(), we calculate the probabilities with a data frame (a base-R equivalent is shown below for comparison). Calculate the marginal probabilities of each race:
margin_race <-
FLVoters %>%
count(race) %>%
mutate(prop = n / sum(n))
margin_race
#> # A tibble: 6 x 3
#> race n prop
#> <chr> <int> <dbl>
#> 1 asian 175 0.0192
#> 2 black 1194 0.131
#> 3 hispanic 1192 0.131
#> 4 native 29 0.00318
#> 5 other 310 0.0340
#> 6 white 6213 0.682
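For comparison, a base-R version of the same marginal proportions might look like this (a sketch using prop.table()):
# base-R equivalent of the prop column in margin_race
prop.table(table(FLVoters$race))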
Calculate the marginal probabilities of each gender:
margin_gender <-
FLVoters %>%
count(gender) %>%
mutate(prop = n / sum(n))
margin_gender
#> # A tibble: 2 x 3
#> gender n prop
#> <chr> <int> <dbl>
#> 1 f 4883 0.536
#> 2 m 4230 0.464
FLVoters %>%
filter(gender == "f") %>%
count(race) %>%
mutate(prop = n / sum(n))
#> # A tibble: 6 x 3
#> race n prop
#> <chr> <int> <dbl>
#> 1 asian 83 0.0170
#> 2 black 678 0.139
#> 3 hispanic 666 0.136
#> 4 native 17 0.00348
#> 5 other 158 0.0324
#> 6 white 3281 0.672
joint_p <-
FLVoters %>%
count(gender, race) %>%
mutate(prop = n / sum(n))
joint_p
#> # A tibble: 12 x 4
#> gender race n prop
#> <chr> <chr> <int> <dbl>
#> 1 f asian 83 0.00911
#> 2 f black 678 0.0744
#> 3 f hispanic 666 0.0731
#> 4 f native 17 0.00187
#> 5 f other 158 0.0173
#> 6 f white 3281 0.360
#> # ... with 6 more rows
We can convert the data frame to have gender as columns:
joint_p %>%
select(-n) %>%
spread(gender, prop)
#> # A tibble: 6 x 3
#> race f m
#> <chr> <dbl> <dbl>
#> 1 asian 0.00911 0.0101
#> 2 black 0.0744 0.0566
#> 3 hispanic 0.0731 0.0577
#> 4 native 0.00187 0.00132
#> 5 other 0.0173 0.0167
#> 6 white 0.360 0.322
Sum over race:
joint_p %>%
group_by(race) %>%
summarise(prop = sum(prop))
#> # A tibble: 6 x 2
#> race prop
#> <chr> <dbl>
#> 1 asian 0.0192
#> 2 black 0.131
#> 3 hispanic 0.131
#> 4 native 0.00318
#> 5 other 0.0340
#> 6 white 0.682
Sum over gender:
joint_p %>%
group_by(gender) %>%
summarise(prop = sum(prop))
#> # A tibble: 2 x 2
#> gender prop
#> <chr> <dbl>
#> 1 f 0.536
#> 2 m 0.464
FLVoters <-
FLVoters %>%
mutate(age_group = cut(age, c(0, 20, 40, 60, Inf), right = TRUE,
labels = c("<= 20", "20-40", "40-60", "> 60")))
joint3 <-
FLVoters %>%
count(race, age_group, gender) %>%
ungroup() %>%
mutate(prop = n / sum(n))
joint3
#> # A tibble: 47 x 5
#> race age_group gender n prop
#> <chr> <fct> <chr> <int> <dbl>
#> 1 asian <= 20 f 1 0.000110
#> 2 asian <= 20 m 2 0.000219
#> 3 asian 20-40 f 24 0.00263
#> 4 asian 20-40 m 26 0.00285
#> 5 asian 40-60 f 38 0.00417
#> 6 asian 40-60 m 47 0.00516
#> # ... with 41 more rows
Marginal probabilities by age group:
margin_age <-
FLVoters %>%
count(age_group) %>%
mutate(prop = n / sum(n))
margin_age
#> # A tibble: 4 x 3
#> age_group n prop
#> <fct> <int> <dbl>
#> 1 <= 20 161 0.0177
#> 2 20-40 2469 0.271
#> 3 40-60 3285 0.360
#> 4 > 60 3198 0.351
Calculate the conditional probability of race and gender given age group, and show \(P(\text{black} \land \text{female} \mid \text{age} > 60)\). (Note: the symbol \(\land\) is the logical symbol for 'and', denoting the joint event.)
left_join(joint3,
select(margin_age, age_group, margin_age = prop),
by = "age_group") %>%
mutate(prob_age_group = prop / margin_age) %>%
filter(race == "black", gender == "f", age_group == "> 60") %>%
select(race, age_group, gender, prob_age_group)
#> # A tibble: 1 x 4
#> race age_group gender prob_age_group
#> <chr> <fct> <chr> <dbl>
#> 1 black > 60 f 0.0538
Two-way joint probability table for age group and gender
joint2 <- FLVoters %>%
count(age_group, gender) %>%
ungroup() %>%
mutate(prob_age_gender = n / sum(n))
joint2
#> # A tibble: 8 x 4
#> age_group gender n prob_age_gender
#> <fct> <chr> <int> <dbl>
#> 1 <= 20 f 88 0.00966
#> 2 <= 20 m 73 0.00801
#> 3 20-40 f 1304 0.143
#> 4 20-40 m 1165 0.128
#> 5 40-60 f 1730 0.190
#> 6 40-60 m 1555 0.171
#> # ... with 2 more rows
The joint probability \(P(\text{age} > 60 \land \text{female})\),
joint2 %>%
filter(age_group == "> 60", gender == "f")
#> # A tibble: 1 x 4
#> age_group gender n prob_age_gender
#> <fct> <chr> <int> <dbl>
#> 1 > 60 f 1761 0.193
The conditional probabilities \(P(\text{race } | \text{ gender, age})\),
condprob_race <-
left_join(joint3, select(joint2, -n), by = c("age_group", "gender")) %>%
mutate(prob_race = prop / prob_age_gender) %>%
arrange(age_group, gender) %>%
select(age_group, gender, race, prob_race)
Each row is the \(P(\text{race } | \text{ age group} \land \text{gender})\), so \(P(\text{black } | \text{ female} \land \text{age} > 60)\),
filter(condprob_race, gender == "f", age_group == "> 60", race == "black")
#> # A tibble: 1 x 4
#> age_group gender race prob_race
#> <fct> <chr> <chr> <dbl>
#> 1 > 60 f black 0.0977
6.2.2 Independence
Create a table with the products of the marginal probabilities of race and gender. Use the function crossing to create a tibble with all combinations of race and gender along with the probability implied by independence.
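Recall that two events are independent when their joint probability equals the product of their marginal probabilities: \[ P(\text{race} \land \text{gender}) = P(\text{race}) \times P(\text{gender}) . \] The prob_indep column below is this product; comparing it to the observed joint probability prob checks how close race and gender are to independent.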
race_gender_indep <-
crossing(select(margin_race, race, prob_race = prop),
select(margin_gender, gender, prob_gender = prop)) %>%
mutate(prob_indep = prob_race * prob_gender) %>%
left_join(select(joint_p, gender, race, prob = prop),
by = c("gender", "race")) %>%
select(race, gender, everything())
race_gender_indep
#> # A tibble: 12 x 6
#> race gender prob_race prob_gender prob_indep prob
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 asian f 0.0192 0.536 0.0103 0.00911
#> 2 asian m 0.0192 0.464 0.00891 0.0101
#> 3 black f 0.131 0.536 0.0702 0.0744
#> 4 black m 0.131 0.464 0.0608 0.0566
#> 5 hispanic f 0.131 0.536 0.0701 0.0731
#> 6 hispanic m 0.131 0.464 0.0607 0.0577
#> # ... with 6 more rows
ggplot(race_gender_indep,
aes(x = prob_indep, y = prob, colour = race)) +
geom_abline(intercept = 0, slope = 1, colour = "white", size = 2) +
geom_point() +
facet_grid(. ~ gender) +
coord_fixed() +
theme(legend.position = "bottom") +
labs(x = expression(P("race") * P("gender")),
y = expression(P("race and gender")))
While the original code only calculates the joint-independence value for age > 60 and female, the following calculates the joint probabilities for all combinations of the three variables, and facets by age group and gender.
joint_indep <-
crossing(select(margin_race, race, prob_race = prop),
select(margin_age, age_group, prob_age = prop),
select(margin_gender, gender, prob_gender = prop)) %>%
mutate(indep_prob = prob_race * prob_age * prob_gender) %>%
left_join(select(joint3, race, age_group, gender, prob = prop),
by = c("gender", "age_group", "race")) %>%
replace_na(list(prob = 0))
ggplot(joint_indep, aes(x = prob, y = indep_prob, colour = race)) +
geom_abline(intercept = 0, slope = 1, colour = "white", size = 2) +
geom_point() +
facet_grid(age_group ~ gender) +
coord_fixed() +
labs(x = "P(race and age and gender)",
y = "P(race) * P(age) * P(gender)",
title = "Joint Independence")
While the code in QSS only calculates the conditional independence given female, the following code calculates conditional independence for all values of gender:
cond_gender <-
left_join(select(joint3, race, age_group, gender, joint_prob = prop),
select(margin_gender, gender, prob_gender = prop),
by = c("gender")) %>%
mutate(cond_prob = joint_prob / prob_gender)
Calculate the conditional distribution \(\Pr(\text{race} | \text{gender})\):
prob_race_gender <-
left_join(select(joint_p, race, gender, prob_race_gender = prop),
select(margin_gender, gender, prob_gender = prop),
by = "gender") %>%
mutate(prob_race = prob_race_gender / prob_gender)
Calculate the conditional distribution \(\Pr(\text{age} | \text{gender})\):
prob_age_gender <-
left_join(select(joint2, age_group, gender, prob_age_gender),
select(margin_gender, gender, prob_gender = prop),
by = "gender") %>%
mutate(prob_age = prob_age_gender / prob_gender)
# indep prob of race and age
indep_cond_gender <-
full_join(select(prob_race_gender, race, gender, prob_race),
select(prob_age_gender, age_group, gender, prob_age),
by = "gender") %>%
mutate(indep_prob = prob_race * prob_age)
inner_join(select(indep_cond_gender, race, age_group, gender, indep_prob),
select(cond_gender, race, age_group, gender, cond_prob),
by = c("gender", "age_group", "race")) %>%
ggplot(aes(x = cond_prob, y = indep_prob, colour = race)) +
geom_abline(intercept = 0, slope = 1, colour = "white", size = 2) +
geom_point() +
facet_grid(age_group ~ gender) +
coord_fixed() +
labs(x = "P(race and age | gender)",
y = "P(race | gender) * P(age | gender)",
title = "Marginal independence")
6.2.2.1 Monty Hall problem
The for loop approach in QSS is valid code, but here we provide a more functional approach to solving the problem. We will define a function to play a single game, repeat the function multiple times while storing the results in a data frame, and then summarize that data frame.
First, create a function for a single iteration. It returns a one-row tibble with the iteration number and whether switching doors wins the car:
choose_door <- function(.iter) {
# what's behind each door:
# Why is it okay that this is fixed?
doors <- c("goat", "goat", "car")
# User randomly chooses a door
first <- sample(1:3, 1)
# randomly choose the door that Monty Hall reveals
remain <- doors[-first]
monty <- sample( (1:2)[remain == "goat"], size = 1)
# does the contestant win by switching doors?
tibble(.iter = .iter,
result = remain[-monty] == "car")
}
Now use map_df to run choose_door multiple times, and then summarize the results:
sims <- 1000
map_df(seq_len(sims), choose_door) %>%
summarise(win_pct = mean(result))
#> # A tibble: 1 x 1
#> win_pct
#> <dbl>
#> 1 0.648
6.2.3 Bayes' Rule
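For reference, Bayes' rule relates the two directions of conditioning: \[ P(A \mid B) = \frac{P(B \mid A) \, P(A)}{P(B)} . \] The next section applies it to compute the probability of a voter's race given their surname and residence location.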
6.2.4 Predicting Race Using Surname and Residence Location
Start with the Census surname data:
data("cnames", package = "qss")
glimpse(cnames)
#> Observations: 151,671
#> Variables: 7
#> $ surname <chr> "SMITH", "JOHNSON", "WILLIAMS", "BROWN", "JONES", ...
#> $ count <int> 2376206, 1857160, 1534042, 1380145, 1362755, 11278...
#> $ pctwhite <dbl> 73.34, 61.55, 48.52, 60.72, 57.69, 85.80, 64.73, 6...
#> $ pctblack <dbl> 22.22, 33.80, 46.72, 34.54, 37.73, 10.41, 30.77, 0...
#> $ pctapi <dbl> 0.40, 0.42, 0.37, 0.41, 0.35, 0.42, 0.40, 1.43, 0....
#> $ pcthispanic <dbl> 1.56, 1.50, 1.60, 1.64, 1.44, 1.43, 1.58, 90.82, 9...
#> $ pctothers <dbl> 2.48, 2.73, 2.79, 2.69, 2.79, 1.94, 2.52, 1.09, 0....
For each surname, cnames contains variables with the percentage of individuals with that surname who belong to each race (pctwhite, pctblack, …). We want to find the most likely race for a given surname by finding the race with the maximum percentage. Instead of dealing with multiple variables, it is easier to work with a single variable, so we will rearrange the data in order to use a grouped summarize, and then merge the new variable back into the original data set. We will also remove the 'pct' prefix from each of the variable names.
Calculate the most likely race for each name:
most_likely_race <-
cnames %>%
select(-count) %>%
gather(race_pred, pct, -surname) %>%
# remove the pct prefix from the variable names
mutate(race_pred = str_replace(race_pred, "^pct", "")) %>%
# group by surname
group_by(surname) %>%
# select obs with the largest percentage
filter(row_number(desc(pct)) == 1L) %>%
# Ungroup to avoid errors later
ungroup %>%
# don't need pct anymore
select(-pct) %>%
mutate(race_pred = recode(race_pred, asian = "api", other = "others"))
Merge the data frame with the most likely race for each surname back into the original cnames data frame:
cnames <-
cnames %>%
left_join(most_likely_race, by = "surname")
Instead of using match, use inner_join to merge the surnames to FLVoters:
FLVoters <-
FLVoters %>%
inner_join(cnames, by = "surname")
dim(FLVoters)
#> [1] 8022 14
FLVoters also includes a "native" category that the surname dataset does not.
FLVoters <-
FLVoters %>%
mutate(race2 = fct_recode(race, other = "native"))
Check the categories of race2 and race_pred:
FLVoters %>%
count(race2)
#> # A tibble: 5 x 2
#> race2 n
#> <fct> <int>
#> 1 asian 140
#> 2 black 1078
#> 3 hispanic 1023
#> 4 other 277
#> 5 white 5504
FLVoters %>%
count(race_pred)
#> # A tibble: 5 x 2
#> race_pred n
#> <chr> <int>
#> 1 api 120
#> 2 black 258
#> 3 hispanic 1121
#> 4 others 7
#> 5 white 6516
Now we can calculate the true positive rate for all races,
FLVoters %>%
group_by(race2) %>%
summarise(tp = mean(race2 == race_pred)) %>%
arrange(desc(tp))
#> # A tibble: 5 x 2
#> race2 tp
#> <fct> <dbl>
#> 1 white 0.950
#> 2 hispanic 0.847
#> 3 black 0.160
#> 4 asian 0
#> 5 other 0
and the false discovery rate for all races:
FLVoters %>%
group_by(race_pred) %>%
summarise(fp = mean(race2 != race_pred)) %>%
arrange(desc(fp))
#> # A tibble: 5 x 2
#> race_pred fp
#> <chr> <dbl>
#> 1 api 1
#> 2 others 1
#> 3 black 0.329
#> 4 hispanic 0.227
#> 5 white 0.197
Now add residence data, using the FLCensus data included in the qss package:
data("FLCensus", package = "qss")
\(P(\text{race})\) in Florida:
race.prop <-
FLCensus %>%
select(total.pop, white, black, api, hispanic, others) %>%
gather(race, pct, -total.pop) %>%
group_by(race) %>%
summarise(mean = weighted.mean(pct, w = total.pop)) %>%
arrange(desc(mean))
race.prop
#> # A tibble: 5 x 2
#> race mean
#> <chr> <dbl>
#> 1 white 0.605
#> 2 hispanic 0.213
#> 3 black 0.139
#> 4 api 0.0219
#> 5 others 0.0214
6.2.5 Predicting Election Outcomes with Uncertainty
Load the pres08 data from the qss package.
data("pres08", package = "qss")
Add a column p which contains Obama's vote share of the major parties:
pres08 <- pres08 %>%
mutate(p = Obama / (Obama + McCain))
Write a function to simulate the elections. df is the data frame (pres08) with the state, EV, and p columns. n_draws is the size of the binomial distribution to draw from. .id is the simulation number.
sim_election <- function(.id, df, n_draws = 1000) {
# For each state randomly sample
mutate(df,
draws = rbinom(n(), n_draws, p)) %>%
filter(draws > (n_draws / 2)) %>%
summarise(EV = sum(EV),
.id = .id)
}
Now simulate the election 10,000 times:
sims <- 10000
sim_results <- map_df(seq_len(sims), ~ sim_election(.x, pres08, n_draws = 1000))
In the 2008 election, Obama received 364 electoral votes
ELECTION_EV <- 364
And plot them,
ggplot(sim_results, aes(x = EV, y = ..density..)) +
geom_histogram(binwidth = 10, fill = "gray30") +
geom_vline(xintercept = ELECTION_EV, colour = "black", size = 2) +
labs(x = "Electoral Votes", y = "density")
Simulation mean, variance, and standard deviations:
sim_results %>%
select(EV) %>%
summarise_all(funs(mean, var, sd))
#> mean var sd
#> 1 352 269 16.4
Theoretical probabilities from a binomial distribution:
# we cannot use n, because mutate will look for n() first.
n_draws <- 1000
pres08 %>%
mutate(pb = pbinom(n_draws / 2, size = n_draws, prob = p,
lower.tail = FALSE)) %>%
summarise(mean = sum(pb * EV),
V = sum(pb * (1 - pb) * EV ^ 2),
sd = sqrt(V))
#> mean V sd
#> 1 352 269 16.4
6.3 Random Variables and Probability Distributions
6.3.1 Bernoulli and Uniform Distributions
Uniform distribution functions:
dunif(0.5, min = 0, max = 1)
#> [1] 1
punif(1, min = -2, max = 2)
#> [1] 0.75
First, set the number of simulations and the probability parameter:
sims <- 1000
p <- 0.5
A Bernoulli distribution is a discrete distribution which randomly samples from 0 and 1. A random variable \(X\) with a Bernoulli distribution with probability parameter \(p\) has the distribution, \[ \begin{aligned} P(X = 1 | p) &= p \\ P(X = 0 | p) &= 1 - p \end{aligned} \]
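From this definition, the expectation and variance follow directly: \[ E[X] = 1 \cdot p + 0 \cdot (1 - p) = p, \qquad \mathrm{Var}(X) = E[X^2] - E[X]^2 = p - p^2 = p (1 - p) . \]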
Base R does not have an rbernoulli function for the Bernoulli distribution (because, as we will see, it is a special case of the binomial distribution, and there are several other ways to sample from it). Here are three ways to sample from a Bernoulli distribution.
First, we can use sample() to draw from the values c(0, 1) with replacement. By default it samples 0 and 1 with equal probability (\(p = 0.5\)), but using the prob argument we can sample 0 and 1 with different probabilities. Take 100 samples from a Bernoulli distribution with p = 0.6:
p <- 0.6
n <- 100
y <- sample(c(0, 1), n, prob = c(1 - p, p), replace = TRUE)
head(y)
#> [1] 1 0 1 1 1 1
mean(y)
#> [1] 0.57
A second method to sample \(n\) values from a Bernoulli distribution with probability parameter \(p\) is:
- Take a sample of \(n\) values from a Uniform(0, 1) distribution. Call this vector \(x\).
- To generate a sample \(y\) taking values of 0 and 1 from this vector \(x\), set \(y_i = 1\) if \(x_i \leq p\), and \(y_i = 0\) if \(x_i > p\).
Note how this method works. Given a value \(p\) between 0 and 1, in a sample from the Uniform(0, 1) distribution, in expectation a fraction \(p\) of the values will be less than or equal to \(p\), and a fraction \(1 - p\) will be greater than \(p\). These are exactly the probabilities of sampling 1 and 0 from a Bernoulli distribution with parameter \(p\):
y <- as.integer(runif(n, min = 0, max = 1) <= p)
head(y)
mean(y)
Third, note that the Bernoulli distribution is a special case of the binomial distribution (discussed in the next section), where \(size = 1\). Thus, we can use the rbinom function with size = 1 to sample from a Bernoulli distribution.
y <- rbinom(n, size = 1, prob = p)
head(y)
#> [1] 1 1 1 0 1 1
mean(y)
#> [1] 0.64
Since base R functions for the Bernoulli distribution don't exist, here are examples of how you could write them, as examples of writing functions to sample from a distribution and to calculate its PMF, CDF, and quantile function. Sample from the distribution:
rbernoulli <- function(n, prob = 0.5) {
sample(c(1L, 0L), n, replace = TRUE, prob = c(prob, 1 - prob))
}
Probability mass function (PMF):
dbernoulli <- function(x, prob = 0.5) {
d <- rep(NA_real_, length(x))
d[x == 1] <- prob
d[x == 0] <- 1 - prob
d
}
Cumulative distribution function (CDF): \[ P(X \leq x|p) = \begin{cases} 0 & \text{if } x < 0 , \\ 1 - p & \text{if } 0 \leq x < 1 ,\\ 1 & \text{if } x \geq 1 . \end{cases} \]
pbernoulli <- function(q, prob = 0.5) {
p <- rep(NA_real_, length(q))
p[q < 0] <- 0
p[q >= 0 & q < 1] <- 1 - prob
p[q >= 1] <- 1
p
}
The inverse CDF, or quantile function, returns the smallest \(q\) such that \(P(X \leq q | p) \geq x\): \[ Q(x) = \begin{cases} 0 & \text{if } 0 \leq x \leq 1 - p, \\ 1 & \text{if } 1 - p < x \leq 1 . \end{cases} \]
qbernoulli <- function(p, prob = 0.5) {
q <- rep(NA_integer_, length(p))
q[p >= 0 & p <= 1 - prob] <- 0L
q[p > 1 - prob & p <= 1] <- 1L
q
}
Alternatively, since the Bernoulli distribution is a special case of the binomial distribution, write *bernoulli functions that wrap calls to the *binom functions for the special case where size = 1:
rbernoulli <- function(n, prob = 0.5, ...) {
rbinom(n, size = 1, prob = prob, ...)
}
dbernoulli <- function(x, prob = 0.5, ...) {
dbinom(x, size = 1, prob = prob, ...)
}
pbernoulli <- function(q, prob = 0.5, ...) {
pbinom(q, size = 1, prob = prob, ...)
}
qbernoulli <- function(p, prob = 0.5, ...) {
qbinom(p, size = 1, prob = prob, ...)
}
6.3.2 Binomial distribution
dbinom(2, size = 3, prob = 0.5)
#> [1] 0.375
pbinom(1, 3, 0.5)
#> [1] 0.5
voters <- c(1000, 10000, 100000)
dbinom(voters / 2, size = voters, prob = 0.5)
#> [1] 0.02523 0.00798 0.00252
6.3.3 Normal distribution
pnorm(1) - pnorm(-1)
#> [1] 0.683
pnorm(2) - pnorm(-2)
#> [1] 0.954
Write a function to calculate the area of a normal distribution within \(\pm x\) standard deviations of the mean:
normal_pm_sigma <- function(x, mu = 0, sd = 1) {
(pnorm(mu + sd * x, mean = mu, sd = sd) -
pnorm(mu - sd * x, mean = mu, sd = sd))
}
normal_pm_sigma(1)
#> [1] 0.683
normal_pm_sigma(2)
#> [1] 0.954
normal_pm_sigma(1, mu = 5, sd = 2)
#> [1] 0.683
normal_pm_sigma(2, mu = 5, sd = 2)
#> [1] 0.954
data("pres08", package = "qss")
data("pres12", package = "qss")
Join both data frames:
pres <-
full_join(select(pres08, state, Obama_2008 = Obama, McCain_2008 = McCain,
EV_2008 = EV),
select(pres12, state, Obama_2012 = Obama, Romney_2012 = Romney,
EV_2012 = EV),
by = "state") %>%
mutate(Obama_2008_z = as.numeric(scale(Obama_2008)),
Obama_2012_z = as.numeric(scale(Obama_2012)))
fit1 <- lm(Obama_2012_z ~ -1 + Obama_2008_z, data = pres)
Plot the residuals and compare them to a normal distribution:
err <- tibble(err = resid(fit1)) %>%
# z-score of residuals
mutate(err_std = err / sd(err))
ggplot(err, aes(x = err_std)) +
geom_histogram(mapping = aes(y = ..density..),
binwidth = 1, boundary = 0) +
stat_function(geom = "line", fun = dnorm) +
scale_x_continuous("Standardized residuals",
breaks = -3:3, limits = c(-3, 3))
ggplot(err, aes(sample = err_std)) +
geom_abline(intercept = 0, slope = 1, color = "white", size = 2) +
geom_qq() +
coord_fixed(ratio = 1) +
scale_y_continuous("Sample quantiles", limits = c(-3, 3)) +
scale_x_continuous("Theoretical quantiles", limits = c(-3, 3))
Alternatively, you can use the augment function from broom, which returns the residuals for each observation in the .resid column.
augment(fit1) %>%
mutate(.resid_z = .resid / sd(.resid)) %>%
ggplot(aes(x = .resid_z)) +
geom_histogram(mapping = aes(y = ..density..),
binwidth = 1, boundary = 0) +
stat_function(geom = "line", fun = dnorm) +
scale_x_continuous("Standardized residuals",
breaks = -3:3, limits = c(-3, 3))
Obama’s vote shares in 2008 and 2012.
Standard deviation of errors:
err_sd <- sd(resid(fit1))
What is the probability of Obama receiving a larger vote share in California in 2012 than in 2008?
CA_2008 <- filter(pres, state == "CA")$Obama_2008_z
The predicted value of the 2012 vote share can be calculated manually with \(\hat{\beta} \times x\),
CA_mean_2012 <- CA_2008 * coef(fit1)["Obama_2008_z"]
CA_mean_2012
#> Obama_2008_z
#> 0.858
or calculated using the predict function, with the newdata argument providing the specific values at which to predict:
predict(fit1, newdata = tibble(Obama_2008_z = CA_2008))
#> 1
#> 0.858
Now calculate this probability:
pnorm(CA_2008, mean = CA_mean_2012, sd = err_sd,
lower.tail = FALSE)
#> [1] 0.468
We can generalize the previous code to calculate the probability that Obama would exceed his 2008 vote to all states:
pres_gt_2008 <- augment(fit1) %>%
mutate(p_greater = pnorm(Obama_2008_z, mean = .fitted, sd = err_sd,
lower.tail = FALSE))
Plotting these results, we can observe regression to the mean. States with larger (smaller) 2008 vote shares had a lower (higher) probability that the 2012 vote share will exceed them.
ggplot(pres_gt_2008, aes(x = Obama_2008_z, y = p_greater)) +
modelr::geom_ref_line(h = 0.5) +
modelr::geom_ref_line(v = 0) +
geom_point() +
labs(x = "Standardized vote share of Obama (2008)",
y = "Pr. 2012 vote share greater than 2008")
6.3.4 Expectation and Variance
Theoretical variance and actual sample variance of a set of Bernoulli draws:
p <- 0.5
# theoretical variance
p * (1 - p)
#> [1] 0.25
# a sample
y <- sample(c(0, 1), size = 1000, replace = TRUE, prob = c(1 - p, p))
# the sample variance of that sample
var(y)
#> [1] 0.25
6.3.5 Predicting Election Outcomes with Uncertainty
Load 2008 Presidential Election data:
data("pres08", package = "qss")
pres08 <- mutate(pres08, p = Obama / (Obama + McCain))
sim_election <- function(x, n = 1000) {
n_states <- nrow(x)
# samples the number of votes for Obama
mutate(x,
draws = rbinom(n_states, size = !!n, prob = p),
obama_EV = EV * (draws > (!!n / 2))) %>%
pluck("obama_EV") %>%
sum()
}
This function returns the electoral votes for Obama for a single simulation:
sim_election(pres08)
#> [1] 353
Run this simulation sims
times, saving the electoral votes of Obama in each simulation:
sims <- 10000
Obama_EV_sims <- map_dbl(seq_len(sims), ~ sim_election(pres08))
Obama's actual electoral vote total:
OBAMA_EV <- 364
library("glue")
ggplot(tibble(Obama_EV = Obama_EV_sims),
aes(x = Obama_EV, y = ..density..)) +
geom_histogram(binwidth = 10, boundary = 0, fill = "gray60") +
geom_vline(xintercept = OBAMA_EV, colour = "black", size = 2) +
annotate("text", x = OBAMA_EV + 2, y = 0.02,
label = glue("Obama's 2008 EV = {OBAMA_EV}"), hjust = 0) +
scale_x_continuous("Obama's Electoral College Votes",
breaks = seq(300, 400, by = 20)) +
scale_y_continuous("Density") +
labs(title = "Prediction of election outcomes")
Summarize the simulations:
summary(Obama_EV_sims)
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 289 340 353 352 364 398
Compare theoretical and simulation means:
# simulation EV
mean(Obama_EV_sims)
#> [1] 352
# theoretical
n <- 1000
pres08 %>%
mutate(Obama_EV = EV * pbinom(n / 2, size = n, prob = p,
lower.tail = FALSE)) %>%
summarise(Obama_EV = sum(Obama_EV))
#> Obama_EV
#> 1 352
Compare theoretical and simulation variances:
# simulation variance
var(Obama_EV_sims)
#> [1] 269
# theoretical variance
Obama_EV_var <- pres08 %>%
mutate(pb = pbinom(n / 2, size = n, prob = p, lower.tail = FALSE),
EV_var = pb * (1 - pb) * EV ^ 2) %>%
summarise(EV_var = sum(EV_var)) %>%
pluck("EV_var")
Obama_EV_var
#> [1] 269
and standard deviations
# sim
sd(Obama_EV_sims)
#> [1] 16.4
# theoretical
sqrt(Obama_EV_var)
#> [1] 16.4
6.4 Large Sample Theorems
6.4.1 Law of Large Numbers
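The (weak) law of large numbers says that the sample mean converges in probability to the expectation as the number of draws grows: \[ \bar{X}_n = \frac{1}{n} \sum_{i = 1}^{n} X_i \xrightarrow{p} E[X] \quad \text{as } n \to \infty . \] The simulations below illustrate this for a binomial and a uniform distribution.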
Put the simulation number, x, and mean in a tibble:
sims <- 1000
p <- 0.2
size <- 10
lln_binom <- tibble(
n = seq_len(sims),
x = rbinom(sims, prob = p, size = size),
mean = cumsum(x) / n,
distrib = str_c("Binomial(", size, ", ", p, ")"))
lln_unif <-
tibble(n = seq_len(sims),
x = runif(sims),
mean = cumsum(x) / n,
distrib = str_c("Uniform(0, 1)"))
true_means <-
tribble(~distrib, ~mean,
"Uniform(0, 1)", 0.5,
str_c("Binomial(", size, ", ", p, ")"), size * p)
ggplot() +
geom_hline(aes(yintercept = mean), data = true_means,
colour = "white", size = 2) +
geom_line(aes(x = n, y = mean),
data = bind_rows(lln_binom, lln_unif)) +
facet_grid(distrib ~ ., scales = "free_y") +
labs(x = "Sample Size", y = "Sample Mean")
6.4.2 Central Limit Theorem
The population mean of the binomial distribution is \(\mu = p n\) and the variance is \(\sigma^2 = p (1 - p) n\).
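The z-scores computed below standardize the sample mean; by the central limit theorem, for large samples they are approximately standard normal: \[ Z = \frac{\bar{X}_n - \mu}{\sqrt{\sigma^2 / n}} \approx N(0, 1) . \]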
sims <- 1000
n_samp <- 1000
Write functions to calculate the mean of a binomial distribution with size size and probability p,
binom_mean <- function(size, p) {
size * p
}
variance of a binomial distribution,
binom_var <- function(size, p) {
size * p * (1 - p)
}
Write a function that takes n_samp samples from a binomial distribution with size size and probability of success p, and returns a data frame with the z-score of the sample mean:
sim_binom_clt <- function(n_samp, size, p) {
x <- rbinom(n_samp, prob = p, size = size)
z <- (mean(x) - binom_mean(size, p)) /
sqrt(binom_var(size, p) / n_samp)
tibble(distrib = str_c("Binomial(", p, ", ", size, ")"),
z = z)
}
For the uniform distribution, we need a function to calculate the mean of a uniform distribution,
unif_mean <- function(min, max) {
0.5 * (min + max)
}
variance of a uniform distribution,
unif_var <- function(min, max) {
(1 / 12) * (max - min) ^ 2
}
and a function to perform the CLT simulation,
sim_unif_clt <- function(n_samp, min = 0, max = 1) {
x <- runif(n_samp, min = min, max = max)
z <- (mean(x) - unif_mean(min, max)) /
sqrt(unif_var(min, max) / n_samp)
tibble(distrib = str_c("Uniform(", min, ", ", max, ")"),
z = z)
}
Since we will calculate this for n_samp = 1000 and n_samp = 100, we might as well write a function for it.
clt_plot <- function(n_samp) {
bind_rows(map_df(seq_len(sims), ~ sim_binom_clt(n_samp, size, p)),
map_df(seq_len(sims), ~ sim_unif_clt(n_samp))) %>%
ggplot(aes(x = z)) +
geom_density() +
geom_rug() +
stat_function(fun = dnorm, colour = "red") +
facet_grid(distrib ~ .) +
ggtitle(str_c("Sample size = ", n_samp))
}
clt_plot(1000)
clt_plot(100)