3 Measurement
Prerequisites
library("tidyverse")
library("forcats")
library("broom")
library("tidyr")
3.1 Measuring Civilian Victimization during Wartime
data("afghan", package = "qss")
Summarize the variables of interest:
afghan %>%
select(age, educ.years, employed, income) %>%
summary()
#> age educ.years employed income
#> Min. :15.0 Min. : 0 Min. :0.000 Length:2754
#> 1st Qu.:22.0 1st Qu.: 0 1st Qu.:0.000 Class :character
#> Median :30.0 Median : 1 Median :1.000 Mode :character
#> Mean :32.4 Mean : 4 Mean :0.583
#> 3rd Qu.:40.0 3rd Qu.: 8 3rd Qu.:1.000
#> Max. :80.0 Max. :18 Max. :1.000
Loading data with either data() or read_csv() does not convert strings to factors by default; see the income variable below. To get a summary of the different levels, either convert it to a factor (see R4DS Ch 15) or use count():
count(afghan, income)
#> # A tibble: 6 x 2
#> income n
#> <chr> <int>
#> 1 10,001-20,000 616
#> 2 2,001-10,000 1420
#> 3 20,001-30,000 93
#> 4 less than 2,000 457
#> 5 over 30,000 14
#> 6 <NA> 154
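If you prefer the factor route, a minimal sketch that orders the income brackets by hand (the level labels are copied from the count() output above; income_levels is an arbitrary name):
income_levels <- c("less than 2,000", "2,001-10,000", "10,001-20,000",
                   "20,001-30,000", "over 30,000")
# summary() on a factor reports the count for each level plus the number of NA's
summary(factor(afghan$income, levels = income_levels))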
Use count() to calculate the proportion of respondents who answer that they were harmed by the ISAF or the Taliban (violent.exp.ISAF and violent.exp.taliban, respectively):
afghan %>%
group_by(violent.exp.ISAF, violent.exp.taliban) %>%
count() %>%
ungroup() %>%
mutate(prop = n / sum(n))
#> # A tibble: 9 x 4
#> violent.exp.ISAF violent.exp.taliban n prop
#> <int> <int> <int> <dbl>
#> 1 0 0 1330 0.483
#> 2 0 1 354 0.129
#> 3 0 NA 22 0.00799
#> 4 1 0 475 0.172
#> 5 1 1 526 0.191
#> 6 1 NA 22 0.00799
#> # ... with 3 more rows
We need to use ungroup() to ensure that sum(n) sums over the entire dataset rather than within the groups defined by violent.exp.ISAF and violent.exp.taliban. Unlike prop.table(), the code above does not drop missing values. We can drop those observations by adding filter() with !is.na() to test for missing values in those variables:
afghan %>%
filter(!is.na(violent.exp.ISAF), !is.na(violent.exp.taliban)) %>%
group_by(violent.exp.ISAF, violent.exp.taliban) %>%
count() %>%
ungroup() %>%
mutate(prop = n / sum(n))
#> # A tibble: 4 x 4
#> violent.exp.ISAF violent.exp.taliban n prop
#> <int> <int> <int> <dbl>
#> 1 0 0 1330 0.495
#> 2 0 1 354 0.132
#> 3 1 0 475 0.177
#> 4 1 1 526 0.196
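An equivalent sketch that avoids the group_by()/ungroup() pair by passing both variables directly to count(), which returns an ungrouped result:
afghan %>%
  filter(!is.na(violent.exp.ISAF), !is.na(violent.exp.taliban)) %>%
  count(violent.exp.ISAF, violent.exp.taliban) %>%
  mutate(prop = n / sum(n))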
3.2 Handling Missing Data in R
We already observed the issues that NA values cause when calculating the proportions answering the “experienced violence” questions. As shown above, you can use filter() to drop rows with missing values in particular variables.
head(afghan$income, n = 10)
#> [1] "2,001-10,000" "2,001-10,000" "2,001-10,000" "2,001-10,000"
#> [5] "2,001-10,000" NA "10,001-20,000" "2,001-10,000"
#> [9] "2,001-10,000" NA
head(is.na(afghan$income), n = 10)
#> [1] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE
Counts and proportions of missing values of income:
summarise(afghan,
n_missing = sum(is.na(income)),
p_missing = mean(is.na(income)))
#> n_missing p_missing
#> 1 154 0.0559
mean(), like other summary functions, does not exclude missing values by default. Use na.rm = TRUE in these cases.
x <- c(1, 2, 3, NA)
mean(x)
#> [1] NA
mean(x, na.rm = TRUE)
#> [1] 2
Table of proportions of individuals harmed by the ISAF and the Taliban, including missing (NA) values:
violent_exp_prop <-
afghan %>%
group_by(violent.exp.ISAF, violent.exp.taliban) %>%
count() %>%
ungroup() %>%
mutate(prop = n / sum(n)) %>%
select(-n)
violent_exp_prop
#> # A tibble: 9 x 3
#> violent.exp.ISAF violent.exp.taliban prop
#> <int> <int> <dbl>
#> 1 0 0 0.483
#> 2 0 1 0.129
#> 3 0 NA 0.00799
#> 4 1 0 0.172
#> 5 1 1 0.191
#> 6 1 NA 0.00799
#> # ... with 3 more rows
The data frame above can be reorganized so that rows are ISAF and the columns are Taliban as follows:
violent_exp_prop %>%
spread(violent.exp.taliban, prop)
#> # A tibble: 3 x 4
#> violent.exp.ISAF `0` `1` `<NA>`
#> <int> <dbl> <dbl> <dbl>
#> 1 0 0.483 0.129 0.00799
#> 2 1 0.172 0.191 0.00799
#> 3 NA 0.00254 0.00290 0.00363
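In tidyr 1.0.0 and later, pivot_wider() supersedes spread(); a sketch of the equivalent call:
violent_exp_prop %>%
  pivot_wider(names_from = violent.exp.taliban, values_from = prop)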
drop_na() is an alternative to na.omit() for removing rows with missing values:
drop_na(afghan)
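Unlike na.omit(), drop_na() can restrict the check to particular columns. A sketch that drops rows only when either victimization question is missing, leaving missing values in other columns untouched:
# keep rows where both survey questions were answered; other columns may still contain NA
drop_na(afghan, violent.exp.ISAF, violent.exp.taliban)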
Tip: There are multiple types of missing values.
NA # logical
#> [1] NA
NA_integer_ # integer
#> [1] NA
NA_real_ # double
#> [1] NA
NA_character_ # character
#> [1] NA
In many cases this distinction does not matter, since many functions will coerce these missing values to the correct vector type. However, you will need to use them in some tidyverse functions that require their outputs to be a consistent type, e.g. map() and most of the other purrr functions, and if_else(). The code below produces an error, since the TRUE case returns an integer value (x is an integer) but the FALSE case does not specify the type of NA.
x <- 1:5
class(x)
#> [1] "integer"
if_else(x < 3, x, NA)
#> Error: `false` must be type integer, not logical
So instead of NA, use NA_integer_:
if_else(x < 3, x, NA_integer_)
#> [1] 1 2 NA NA NA
3.3 Visualizing the Univariate Distribution
3.3.1 Barplot
afghan <-
afghan %>%
mutate(violent.exp.ISAF.fct =
fct_explicit_na(fct_recode(factor(violent.exp.ISAF),
Harm = "1", "No Harm" = "0"),
"No response"))
ggplot(afghan, aes(x = violent.exp.ISAF.fct, y = ..prop.., group = 1)) +
geom_bar() +
xlab("Response category") +
ylab("Proportion of respondents") +
ggtitle("Civilian Victimization by the ISAF")
afghan <-
afghan %>%
mutate(violent.exp.taliban.fct =
fct_explicit_na(fct_recode(factor(violent.exp.taliban),
Harm = "1", "No Harm" = "0"),
"No response"))
ggplot(afghan, aes(x = violent.exp.taliban.fct, y = ..prop.., group = 1)) +
  geom_bar() +
  xlab("Response category") +
  ylab("Proportion of respondents") +
  ggtitle("Civilian Victimization by the Taliban")
Instead of creating two separate bar plots, create a single plot faceted by ISAF and Taliban:
select(afghan, violent.exp.ISAF, violent.exp.taliban) %>%
gather(variable, value) %>%
mutate(value = fct_explicit_na(fct_recode(factor(value),
Harm = "1", "No Harm" = "0"),
"No response"),
variable = recode(variable,
violent.exp.ISAF = "ISAF",
violent.exp.taliban = "Taliban")) %>%
ggplot(aes(x = value, y = ..prop.., group = 1)) +
geom_bar() +
facet_wrap(~ variable, ncol = 1) +
xlab("Response category") +
ylab("Proportion of respondents") +
ggtitle("Civilian Victimization")
This plot could be improved by plotting the two distributions together so that they are easier to compare. This requires creating a data frame with the following columns: perpetrator (ISAF, Taliban) and response (No Harm, Harm, No response).
violent_exp <-
afghan %>%
select(violent.exp.ISAF, violent.exp.taliban) %>%
gather(perpetrator, response) %>%
mutate(perpetrator = str_replace(perpetrator, "violent\\.exp\\.", ""),
perpetrator = str_replace(perpetrator, "taliban", "Taliban"),
response = fct_recode(factor(response), "Harm" = "1", "No Harm" = "0"),
response = fct_explicit_na(response, "No response"),
response = fct_relevel(response, c("No response", "No Harm"))
) %>%
count(perpetrator, response) %>%
mutate(prop = n / sum(n))
ggplot(violent_exp, aes(x = prop, y = response, color = perpetrator)) +
geom_point() +
scale_color_manual(values = c(ISAF = "green", Taliban = "black"))
Black was chosen for the Taliban and green for the ISAF because they are the colors of their respective flags.
3.3.2 Histogram
See the documentation for geom_histogram.
ggplot(afghan, aes(x = age, y = ..density..)) +
geom_histogram(binwidth = 5, boundary = 0) +
scale_x_continuous(breaks = seq(20, 80, by = 10)) +
labs(title = "Distribution of respondent's age",
     x = "Age", y = "Density")
ggplot(afghan, aes(x = educ.years, y = ..density..)) +
geom_histogram(binwidth = 1, center = 0) +
geom_vline(xintercept = median(afghan$educ.years),
color = "white", size = 2) +
annotate("text", x = median(afghan$educ.years),
y = 0.2, label = "median", hjust = 0) +
labs(title = "Distribution of respondent's education",
x = "Years of education",
y = "Density")
There are several alternatives to the histogram.
Density plots (geom_density):
dens_plot <- ggplot(afghan, aes(x = age)) +
geom_density() +
scale_x_continuous(breaks = seq(20, 80, by = 10)) +
labs(title = "Distribution of respondent's age",
     x = "Age", y = "Density")
dens_plot
which can be combined with geom_rug() to create a rug plot, which draws small lines on the axis to represent the value of each observation. It can be combined with a scatter plot or density plot to add extra detail. Adjust alpha to modify the transparency of the rug and address overplotting.
dens_plot + geom_rug(alpha = .2)
Frequency polygons (geom_freqpoly): See R for Data Science EDA.
ggplot(afghan, aes(x = age)) +
geom_freqpoly() +
scale_x_continuous(breaks = seq(20, 80, by = 10)) +
labs(title = "Distribution of respondent's age",
     x = "Age", y = "Count")
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
3.3.3 Boxplot
See the documentation for geom_boxplot.
ggplot(afghan, aes(x = 1, y = age)) +
geom_boxplot() +
coord_flip() +
labs(y = "Age", x = "", title = "Distribution of Age")
ggplot(afghan, aes(y = educ.years, x = province)) +
geom_boxplot() +
coord_flip() +
labs(x = "Province", y = "Years of education",
title = "Education by Province")
Helmand and Uruzgan have much lower levels of education than the other provinces, and also report higher levels of violence.
afghan %>%
group_by(province) %>%
summarise(educ.years = mean(educ.years, na.rm = TRUE),
violent.exp.taliban =
mean(violent.exp.taliban, na.rm = TRUE),
violent.exp.ISAF =
mean(violent.exp.ISAF, na.rm = TRUE)) %>%
arrange(educ.years)
#> # A tibble: 5 x 4
#> province educ.years violent.exp.taliban violent.exp.ISAF
#> <chr> <dbl> <dbl> <dbl>
#> 1 Uruzgan 1.04 0.455 0.496
#> 2 Helmand 1.60 0.504 0.541
#> 3 Khost 5.79 0.233 0.242
#> 4 Kunar 5.93 0.303 0.399
#> 5 Logar 6.70 0.0802 0.144
Some alternatives to the traditional boxplot:
The Tufte boxplot:
library("ggthemes")
ggplot(afghan, aes(y = educ.years, x = province)) +
geom_tufteboxplot() +
coord_flip() +
labs(x = "Province", y = "Years of education",
title = "Education by Province")
Dot plot with jitter and adjusted alpha to avoid overplotting:
ggplot(afghan, aes(y = educ.years, x = province)) +
geom_point(position = position_jitter(width = 0.25, height = 0),
alpha = .2) +
coord_flip() +
labs(x = "Province", y = "Years of education",
title = "Education by Province")
A violin plot:
ggplot(afghan, aes(y = educ.years, x = province)) +
geom_violin() +
coord_flip() +
labs(x = "Province", y = "Years of education",
title = "Education by Province")
3.3.4 Printing and saving graphics
Use ggsave() from the ggplot2 package to save graphics to files. Also, R Markdown documents have their own means of creating and saving plots generated by code chunks.
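A minimal sketch of ggsave(); the file name "age-histogram.pdf" and the 6 x 4 inch size are arbitrary choices, and with no plot argument ggsave() saves the most recently displayed plot:
ggplot(afghan, aes(x = age)) +
  geom_histogram(binwidth = 5, boundary = 0)
# write the last plot to a PDF file in the working directory
ggsave("age-histogram.pdf", width = 6, height = 4)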
3.4 Survey Sampling
3.4.1 The Role of Randomization
data("afghan.village", package = "qss")
Box plots of altitude for sampled and non-sampled villages:
ggplot(afghan.village, aes(x = factor(village.surveyed,
labels = c("sampled", "non-sampled")),
y = altitude)) +
geom_boxplot() +
labs(y = "Altitude (meter)", x = "") +
coord_flip()
Box plots of the log population of sampled and non-sampled villages:
ggplot(afghan.village, aes(x = factor(village.surveyed,
labels = c("sampled", "non-sampled")),
y = log(population))) +
geom_boxplot() +
labs(y = "log(population)", x = "") +
coord_flip()
You can also compare these distributions by plotting their densities:
ggplot(afghan.village, aes(colour = factor(village.surveyed,
labels = c("sampled", "non-sampled")),
x = log(population))) +
geom_density() +
geom_rug() +
labs(x = "log(population)", colour = "")
3.4.2 Non-response and other sources of bias
Calculate the rates of item non-response by province to the questions about civilian victimization by ISAF and Taliban forces (violent.exp.ISAF and violent.exp.taliban):
afghan %>%
group_by(province) %>%
summarise(ISAF = mean(is.na(violent.exp.ISAF)),
taliban = mean(is.na(violent.exp.taliban))) %>%
arrange(-ISAF)
#> # A tibble: 5 x 3
#> province ISAF taliban
#> <chr> <dbl> <dbl>
#> 1 Uruzgan 0.0207 0.0620
#> 2 Helmand 0.0164 0.0304
#> 3 Khost 0.00476 0.00635
#> 4 Kunar 0 0
#> 5 Logar 0 0
Calculate the proportion who support the ISAF using the difference in means between the ISAF and control groups:
(mean(filter(afghan, list.group == "ISAF")$list.response) -
mean(filter(afghan, list.group == "control")$list.response))
#> [1] 0.049
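An equivalent sketch as a single pipeline (the column names avg and diff are arbitrary):
afghan %>%
  filter(list.group %in% c("ISAF", "control")) %>%
  group_by(list.group) %>%
  summarise(avg = mean(list.response)) %>%
  spread(list.group, avg) %>%
  mutate(diff = ISAF - control)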
To calculate the table responses to the list experiment in the control, ISAF, and Taliban groups:
afghan %>%
group_by(list.response, list.group) %>%
count() %>%
glimpse() %>%
spread(list.group, n, fill = 0)
#> Observations: 12
#> Variables: 3
#> $ list.response <int> 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4
#> $ list.group <chr> "control", "ISAF", "control", "ISAF", "taliban",...
#> $ n <int> 188, 174, 265, 278, 433, 265, 260, 287, 200, 182...
#> # A tibble: 5 x 4
#> # Groups: list.response [5]
#> list.response control ISAF taliban
#> <int> <dbl> <dbl> <dbl>
#> 1 0 188 174 0
#> 2 1 265 278 433
#> 3 2 265 260 287
#> 4 3 200 182 198
#> 5 4 0 24.0 0
3.5 Measuring Political Polarization
data("congress", package = "qss")
glimpse(congress)
#> Observations: 14,552
#> Variables: 7
#> $ congress <int> 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 8...
#> $ district <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 98, 98, 1, 2, 3, 4, 5, ...
#> $ state <chr> "USA", "ALABAMA", "ALABAMA", "ALABAMA", "ALABAMA", "A...
#> $ party <chr> "Democrat", "Democrat", "Democrat", "Democrat", "Demo...
#> $ name <chr> "TRUMAN", "BOYKIN F.", "GRANT G.", "ANDREWS G.", "...
#> $ dwnom1 <dbl> -0.276, -0.026, -0.042, -0.008, -0.082, -0.170, -0.12...
#> $ dwnom2 <dbl> 0.016, 0.796, 0.999, 1.005, 1.066, 0.870, 0.990, 0.89...
q <-
congress %>%
filter(congress %in% c(80, 112),
party %in% c("Democrat", "Republican")) %>%
ggplot(aes(x = dwnom1, y = dwnom2, colour = party)) +
geom_point() +
facet_wrap(~ congress) +
coord_fixed() +
scale_y_continuous("racial liberalism/conservatism",
limits = c(-1.5, 1.5)) +
scale_x_continuous("economic liberalism/conservatism",
limits = c(-1.5, 1.5))
q
However, since there are colors conventionally associated with Democrats (blue) and Republicans (red), we should use them rather than the defaults. There’s some evidence that using semantically-resonant colors helps readers decode data visualizations (see Lin et al. 2013). Since I’ll reuse the scale several times, I’ll save it in a variable.
scale_colour_parties <-
scale_colour_manual(values = c(Democrat = "blue",
Republican = "red",
Other = "green"))
q + scale_colour_parties
congress %>%
  ggplot(aes(x = dwnom1, y = dwnom2, colour = party)) +
  geom_point() +
  facet_wrap(~ congress) +
  coord_fixed() +
  scale_y_continuous("racial liberalism/conservatism",
                     limits = c(-2, 2)) +
  scale_x_continuous("economic liberalism/conservatism",
                     limits = c(-2, 2)) +
  scale_colour_parties
congress %>%
group_by(congress, party) %>%
summarise(dwnom1 = mean(dwnom1)) %>%
filter(party %in% c("Democrat", "Republican")) %>%
ggplot(aes(x = congress, y = dwnom1,
colour = fct_reorder2(party, congress, dwnom1))) +
geom_line() +
scale_colour_parties +
labs(y = "DW-NOMINATE score (1st Dimension)", x = "Congress",
colour = "Party")
Alternatively, you can plot the mean DW-NOMINATE scores for each party and congress over time. This plot uses color for the parties and labels the points for the first and last congresses (80 and 112) to convey the passage of time.
party_means <-
congress %>%
filter(party %in% c("Democrat", "Republican")) %>%
group_by(party, congress) %>%
summarise(dwnom1 = mean(dwnom1),
dwnom2 = mean(dwnom2))
party_endpoints <-
party_means %>%
filter(congress %in% c(min(congress), max(congress))) %>%
mutate(label = str_c(party, congress, sep = " - "))
ggplot(party_means,
aes(x = dwnom1, y = dwnom2, color = party,
group = party)) +
geom_point() +
geom_path() +
ggrepel::geom_text_repel(data = party_endpoints,
mapping = aes(label = congress),
color = "black") +
scale_y_continuous("racial liberalism/conservatism") +
scale_x_continuous("economic liberalism/conservatism") +
scale_colour_parties
3.5.1 Correlation
Let’s plot the Gini coefficient over time:
data("USGini", package = "qss")
ggplot(USGini, aes(x = year, y = gini)) +
geom_point() +
geom_line() +
labs(x = "Year", y = "Gini coefficient") +
ggtitle("Income Inequality")
To calculate a measure of party polarization, take the code used in the plot of Republican and Democratic party mean ideal points and adapt it to calculate the difference between the party means:
party_polarization <-
congress %>%
group_by(congress, party) %>%
summarise(dwnom1 = mean(dwnom1)) %>%
filter(party %in% c("Democrat", "Republican")) %>%
spread(party, dwnom1) %>%
mutate(polarization = Republican - Democrat)
party_polarization
#> # A tibble: 33 x 4
#> # Groups: congress [33]
#> congress Democrat Republican polarization
#> <int> <dbl> <dbl> <dbl>
#> 1 80 -0.146 0.276 0.421
#> 2 81 -0.195 0.264 0.459
#> 3 82 -0.180 0.265 0.445
#> 4 83 -0.181 0.261 0.442
#> 5 84 -0.209 0.261 0.471
#> 6 85 -0.214 0.250 0.464
#> # ... with 27 more rows
ggplot(party_polarization, aes(x = congress, y = polarization)) +
geom_point() +
geom_line() +
ggtitle("Political Polarization") +
labs(x = "Congress", y = "Republican mean − Democratic mean")
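A hedged sketch of the correlation this subsection is named for, assuming (as in the QSS text) that every second row of USGini lines up, in order, with the 33 congresses in party_polarization:
# assumption: USGini has one row per year and its even-numbered rows
# correspond to the congresses in party_polarization
gini_by_congress <- USGini$gini[seq(2, nrow(USGini), by = 2)]
cor(gini_by_congress, party_polarization$polarization)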
3.5.2 Quantile-Quantile Plot
congress %>%
filter(congress == 112, party %in% c("Republican", "Democrat")) %>%
ggplot(aes(x = dwnom2, y = ..density..)) +
geom_histogram(binwidth = .2) +
facet_grid(party ~ .) +
labs(x = "racial liberalism/conservatism dimension")
The ggplot2 package includes a stat_qq() function that can be used to create Q-Q plots, but it is better suited to comparing a sample distribution against a theoretical distribution, usually the normal. However, we can calculate one by hand, which may give more insight into exactly what the Q-Q plot is doing.
party_qtiles <- tibble(
probs = seq(0, 1, by = 0.01),
Democrat = quantile(filter(congress, congress == 112,
party == "Democrat")$dwnom2,
probs = probs),
Republican = quantile(filter(congress, congress == 112,
party == "Republican")$dwnom2,
probs = probs)
)
party_qtiles
#> # A tibble: 101 x 3
#> probs Democrat Republican
#> <dbl> <dbl> <dbl>
#> 1 0 -0.925 -1.38
#> 2 0.0100 -0.672 -0.720
#> 3 0.0200 -0.619 -0.566
#> 4 0.0300 -0.593 -0.526
#> 5 0.0400 -0.567 -0.468
#> 6 0.0500 -0.560 -0.436
#> # ... with 95 more rows
The plot looks different than the one in the text since the x- and y-scales are in the original values instead of z-scores (see the next section).
party_qtiles %>%
ggplot(aes(x = Democrat, y = Republican)) +
geom_point() +
geom_abline() +
coord_fixed()
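If you want both axes on z-score scales, as the text's figure appears to use, one option (an assumption about the book's approach) is to standardize dwnom2 within each party before taking quantiles:
# standardize each party's dwnom2 values for the 112th congress by hand
dem112 <- filter(congress, congress == 112, party == "Democrat")$dwnom2
rep112 <- filter(congress, congress == 112, party == "Republican")$dwnom2
tibble(probs = seq(0, 1, by = 0.01),
       Democrat = quantile((dem112 - mean(dem112)) / sd(dem112), probs = probs),
       Republican = quantile((rep112 - mean(rep112)) / sd(rep112), probs = probs)) %>%
  ggplot(aes(x = Democrat, y = Republican)) +
  geom_point() +
  geom_abline() +
  coord_fixed()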
3.6 Clustering
3.6.1 Matrices
While matrices are great for numerical computations, such as when you are implementing algorithms, generally keeping data in data frames is more convenient for data wrangling.
See R for Data Science chapter Vectors.
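A minimal sketch of pulling the two ideal-point columns into a numeric matrix, the form that kmeans() and many numerical routines expect (dwnom_mat is an arbitrary name):
dwnom_mat <- congress %>%
  filter(congress == 80) %>%
  select(dwnom1, dwnom2) %>%
  as.matrix()
dim(dwnom_mat)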
3.6.2 Lists
See R for Data Science chapters Vectors and Iteration, as well as the purrr package for more powerful methods of computing on lists.
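A minimal purrr sketch (the scores list and its values are hypothetical): map_dbl() applies a function to each element of a list and returns a numeric vector.
# hypothetical list of per-party scores
scores <- list(dem = c(-0.5, -0.3), rep = c(0.4, 0.6))
map_dbl(scores, mean)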
3.6.3 k-means algorithms
Calculate the clusters for the 80th and 112th congresses, starting with two clusters for the 80th:
k80two.out <-
kmeans(select(filter(congress, congress == 80),
dwnom1, dwnom2),
centers = 2, nstart = 5)
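Because kmeans() chooses random starting centroids, the cluster labels (and occasionally the solution itself) can differ between runs. A sketch of fixing the seed first for reproducibility; the value 1234 is arbitrary:
set.seed(1234)
k80two.out <-
  kmeans(select(filter(congress, congress == 80),
                dwnom1, dwnom2),
         centers = 2, nstart = 5)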
Add the cluster IDs to the data set:
congress80 <-
congress %>%
filter(congress == 80) %>%
mutate(cluster2 = factor(k80two.out$cluster))
We will also create a data set with the cluster centroids. These are stored in the centers element of the clustering object.
k80two.out$centers
#> dwnom1 dwnom2
#> 1 -0.0484 0.783
#> 2 0.1468 -0.339
To make it easier to use with ggplot2, we need to convert this to a data frame; the tidy() function from the broom package does exactly that:
k80two.clusters <- tidy(k80two.out)
k80two.clusters
#> x1 x2 size withinss cluster
#> 1 -0.0484 0.783 135 10.9 1
#> 2 0.1468 -0.339 311 54.9 2
Plot the ideal points and clusters:
ggplot() +
geom_point(data = congress80,
aes(x = dwnom1, y = dwnom2, colour = cluster2)) +
geom_point(data = k80two.clusters, mapping = aes(x = x1, y = x2))
congress80 %>%
group_by(party, cluster2) %>%
count()
#> # A tibble: 5 x 3
#> # Groups: party, cluster2 [5]
#> party cluster2 n
#> <chr> <fct> <int>
#> 1 Democrat 1 132
#> 2 Democrat 2 62
#> 3 Other 2 2
#> 4 Republican 1 3
#> 5 Republican 2 247
And now we can repeat these steps for the 112th congress:
k112two.out <-
kmeans(select(filter(congress, congress == 112),
dwnom1, dwnom2),
centers = 2, nstart = 5)
congress112 <-
filter(congress, congress == 112) %>%
mutate(cluster2 = factor(k112two.out$cluster))
k112two.clusters <- tidy(k112two.out)
ggplot() +
geom_point(data = congress112,
mapping = aes(x = dwnom1, y = dwnom2, colour = cluster2)) +
geom_point(data = k112two.clusters,
mapping = aes(x = x1, y = x2))
Number of observations from each party in each cluster:
congress112 %>%
group_by(party, cluster2) %>%
count()
#> # A tibble: 3 x 3
#> # Groups: party, cluster2 [3]
#> party cluster2 n
#> <chr> <fct> <int>
#> 1 Democrat 1 200
#> 2 Republican 1 1
#> 3 Republican 2 242
Now repeat the same with four clusters on the 80th congress:
k80four.out <-
kmeans(select(filter(congress, congress == 80),
dwnom1, dwnom2),
centers = 4, nstart = 5)
congress80 <-
filter(congress, congress == 80) %>%
mutate(cluster2 = factor(k80four.out$cluster))
k80four.clusters <- tidy(k80four.out)
ggplot() +
geom_point(data = congress80,
mapping = aes(x = dwnom1, y = dwnom2, colour = cluster2)) +
geom_point(data = k80four.clusters,
mapping = aes(x = x1, y = x2), size = 3)
and on the 112th congress:
k112four.out <-
kmeans(select(filter(congress, congress == 112),
dwnom1, dwnom2),
centers = 4, nstart = 5)
congress112 <-
filter(congress, congress == 112) %>%
mutate(cluster2 = factor(k112four.out$cluster))
k112four.clusters <- tidy(k112four.out)
ggplot() +
geom_point(data = congress112,
mapping = aes(x = dwnom1, y = dwnom2, colour = cluster2)) +
geom_point(data = k112four.clusters,
mapping = aes(x = x1, y = x2), size = 3)