21 Multivariate Missing Data

\[ \DeclareMathOperator{diag}{diag} \]

library("tidyverse")
library("rstan")

This example shows how to impute missing data. See Stan Development Team (2016), Chapter 10 “Missing Data & Partially Known Parameters” for more discussion.¹⁷

Consider a data set of 10 observations on 3 variables Only one of the variables, $z$, is completely observed. The other two variables, x$ and $y$, have a non-overlapping pattern of missing data.

multivarmissing <- tribble(
  ~x, ~y, ~z,
1,  NA, NA,
2,  NA, 4,
3,  NA, 3,
4,  NA, 5,
5,  NA, 7,
NA, 7,  9,
NA, 8,  8,
NA, 9,  11,
NA, 8,  10,
NA, 9,  8)

The missing elements of $x$ and $y$ are parameters, and the observed elements of $x$, $y$, and $z$ are data. These are combined in the transformed parameters block, and modeled.

21.1 Separate Regressions

We use $z$ to predict $x$, and $z$ and $x$ (both observed and imputed) to impute $y$.

\[ \begin{aligned}[t] x_i &\sim \mathsf{Normal}(\mu_{x,i}, \sigma_x) \\ \mu_{x,i} &= \gamma_1 + \gamma_2 z_i \\ y_i &\sim \mathsf{Normal}(\mu_{y,i}, \sigma_y) \\ \mu_{y,i} &= \beta_1 + \beta_2 y_i + \beta_3 z_i \\ z_i &\sim \mathsf{Normal}(\mu_z, \sigma_z) \end{aligned} \]

The parameters are given weakly informative parameters: \[ \begin{aligned}[t] \sigma_x,\sigma_y,\sigma_z &\sim \mathsf{HalfCauchy}(0, 5) \\ \gamma_1, \beta_1 &\sim \mathsf{Normal}(0, 10) \\ \gamma_2, \beta_2, \beta_3 &\sim \mathsf{Normal}(0, 2.5) \end{aligned} \] Note that this assumes that $x$, $y$, and $z$ are standardized to have zero mean and unit variance.

data_multivarmissing <- within(list(), {
  N <- nrow(multivarmissing)
  x_obs <- multivarmissing$x[!is.na(multivarmissing$x)] %>%
    scale() %>% as.numeric()
  x_obs_idx <- array(which(!is.na(multivarmissing$x)))
  N_x_obs <- length(x_obs_idx)  
  x_miss_idx <- array(which(is.na(multivarmissing$x)))
  N_x_miss <- length(x_miss_idx)
  y_obs <- multivarmissing$y[!is.na(multivarmissing$y)] %>%
    scale() %>% as.numeric()    
  y_obs_idx <- array(which(!is.na(multivarmissing$y)))
  N_y_obs <- length(y_obs_idx)  
  y_miss_idx <- array(which(is.na(multivarmissing$y)))
  N_y_miss <- length(y_miss_idx)
  z_obs <- multivarmissing$z[!is.na(multivarmissing$z)] %>%
    scale() %>% as.numeric()
  z_obs_idx <- array(which(!is.na(multivarmissing$z)))
  N_z_obs <- length(z_obs_idx)
  z_miss_idx <- array(which(is.na(multivarmissing$z)))
  N_z_miss <- length(z_miss_idx)
  alpha_loc <- 0
  alpha_scale <- 10
  beta_loc <- rep(0, 3)
  beta_scale <- c(10, 2.5, 2.5)
  gamma_loc <- rep(0, 2)
  gamma_scale <- c(10, 2.5)
  sigma_x_scale <- 5
  sigma_y_scale <- 5
  sigma_z_scale <- 5
})

mod_multivarmissing <- stan_model("stan/multivarmissing2.stan")

mod_multivarmissing

  data {
  // number of obs
  int N;
  // number of variables
  int K;
  // X
  int N_obs;
  vector[N_obs] X_obs;
  int X_obs_row[N_obs];
  int X_obs_col[N_obs];
  int N_miss;
  int X_miss_row[N_miss];
  int X_miss_col[N_miss];
  // priors
  vector[K] Sigma_scale_scale;
  real Sigma_corr_L_eta;
  vector[K] mu_loc;
  vector[K] mu_scale;
}
parameters {
  vector[K] mu;
  vector[K] Sigma_scale;
  cholesky_factor_corr[K] Sigma_corr_L;
  vector[N_miss] X_miss;
}
transformed parameters {
  vector[K] X[N];
  for (i in 1:N_obs) {
   X[X_obs_row[i], X_obs_col[i]] = X_obs[i];
  }
  for (i in 1:N_miss) {
    X[X_miss_row[i], X_miss_col[i]] = X_miss[i];
  }
}
model {
  Sigma_corr_L ~ lkj_corr_cholesky(Sigma_corr_L_eta);
  Sigma_scale ~ cauchy(0., Sigma_scale_scale);
  for (i in 1:N) {
    X[i] ~ multi_normal_cholesky(mu, Sigma_corr_L);
  }
}

fit_multivarmissing <-
  sampling(mod_multivarmissing, data = data_multivarmissing)
#> Warning in is.na(x): is.na() applied to non-(list or vector) of type 'NULL'
#> Warning in FUN(X[[i]], ...): data with name K is not numeric and not used
#> Warning in is.na(x): is.na() applied to non-(list or vector) of type 'NULL'
#> Warning in FUN(X[[i]], ...): data with name N_obs is not numeric and not
#> used
#> Warning in is.na(x): is.na() applied to non-(list or vector) of type 'NULL'
#> Warning in FUN(X[[i]], ...): data with name X_obs is not numeric and not
#> used
#> Warning in is.na(x): is.na() applied to non-(list or vector) of type 'NULL'
#> Warning in FUN(X[[i]], ...): data with name X_obs_row is not numeric and
#> not used
#> Warning in is.na(x): is.na() applied to non-(list or vector) of type 'NULL'
#> Warning in FUN(X[[i]], ...): data with name X_obs_col is not numeric and
#> not used
#> Warning in is.na(x): is.na() applied to non-(list or vector) of type 'NULL'
#> Warning in FUN(X[[i]], ...): data with name N_miss is not numeric and not
#> used
#> Warning in is.na(x): is.na() applied to non-(list or vector) of type 'NULL'
#> Warning in FUN(X[[i]], ...): data with name X_miss_row is not numeric and
#> not used
#> Warning in is.na(x): is.na() applied to non-(list or vector) of type 'NULL'
#> Warning in FUN(X[[i]], ...): data with name X_miss_col is not numeric and
#> not used
#> Warning in is.na(x): is.na() applied to non-(list or vector) of type 'NULL'
#> Warning in FUN(X[[i]], ...): data with name Sigma_scale_scale is not
#> numeric and not used
#> Warning in is.na(x): is.na() applied to non-(list or vector) of type 'NULL'
#> Warning in FUN(X[[i]], ...): data with name Sigma_corr_L_eta is not numeric
#> and not used
#> Warning in is.na(x): is.na() applied to non-(list or vector) of type 'NULL'
#> Warning in FUN(X[[i]], ...): data with name mu_loc is not numeric and not
#> used
#> Warning in is.na(x): is.na() applied to non-(list or vector) of type 'NULL'
#> Warning in FUN(X[[i]], ...): data with name mu_scale is not numeric and not
#> used
#> failed to create the sampler; sampling not done

fit_multivarmissing
#> Stan model 'multivarmissing2' does not contain samples.

21.2 Multivariate Normal

Alternatively, $x$, $y$, and $z$ could be modeled as coming from a multivariate normal distribution. \[ \begin{bmatrix} x_i \\ y_i \\ z_i \end{bmatrix} \sim \mathsf{Normal}(\mu, \Sigma) \] where $\mu$ and $\Sigma$ are given weakly informative priors, \[ \begin{aligned}[t] \mu_{i,k} &\sim \mathsf{Normal}(0, 10) & k \in \{1, 2, 3\}, \\ \Sigma &= \diag{\sigma} R \diag{sigma}, \\ \sigma &\sim \mathsf{HalfCauchy}(0, 5), \\ R &\sim \mathsf{LkjCorr}(2) . \end{aligned} \]

data_multivarmissing2 <- within(list(), {
  N <- nrow(multivarmissing)
  K <- ncol(multivarmissing)
  mu_loc <- rep(0, 3)
  mu_scale <- rep(0, 10)
  Sigma_scale_scale <- 5
  Sigma_corr_L_eta <- 2
})

mod_multivarmissing2 <- stan_model("stan/multivarmissing2.stan")

mod_multivarmissing2

  data {
  // number of obs
  int N;
  // number of variables
  int K;
  // X
  int N_obs;
  vector[N_obs] X_obs;
  int X_obs_row[N_obs];
  int X_obs_col[N_obs];
  int N_miss;
  int X_miss_row[N_miss];
  int X_miss_col[N_miss];
  // priors
  vector[K] Sigma_scale_scale;
  real Sigma_corr_L_eta;
  vector[K] mu_loc;
  vector[K] mu_scale;
}
parameters {
  vector[K] mu;
  vector[K] Sigma_scale;
  cholesky_factor_corr[K] Sigma_corr_L;
  vector[N_miss] X_miss;
}
transformed parameters {
  vector[K] X[N];
  for (i in 1:N_obs) {
   X[X_obs_row[i], X_obs_col[i]] = X_obs[i];
  }
  for (i in 1:N_miss) {
    X[X_miss_row[i], X_miss_col[i]] = X_miss[i];
  }
}
model {
  Sigma_corr_L ~ lkj_corr_cholesky(Sigma_corr_L_eta);
  Sigma_scale ~ cauchy(0., Sigma_scale_scale);
  for (i in 1:N) {
    X[i] ~ multi_normal_cholesky(mu, Sigma_corr_L);
  }
}

fit_multivarmissing <-
  sampling(mod_multivarmissing2, data = data_multivarmissing2)
#> Warning in is.na(x): is.na() applied to non-(list or vector) of type 'NULL'
#> Warning in FUN(X[[i]], ...): data with name N_obs is not numeric and not
#> used
#> Warning in is.na(x): is.na() applied to non-(list or vector) of type 'NULL'
#> Warning in FUN(X[[i]], ...): data with name X_obs is not numeric and not
#> used
#> Warning in is.na(x): is.na() applied to non-(list or vector) of type 'NULL'
#> Warning in FUN(X[[i]], ...): data with name X_obs_row is not numeric and
#> not used
#> Warning in is.na(x): is.na() applied to non-(list or vector) of type 'NULL'
#> Warning in FUN(X[[i]], ...): data with name X_obs_col is not numeric and
#> not used
#> Warning in is.na(x): is.na() applied to non-(list or vector) of type 'NULL'
#> Warning in FUN(X[[i]], ...): data with name N_miss is not numeric and not
#> used
#> Warning in is.na(x): is.na() applied to non-(list or vector) of type 'NULL'
#> Warning in FUN(X[[i]], ...): data with name X_miss_row is not numeric and
#> not used
#> Warning in is.na(x): is.na() applied to non-(list or vector) of type 'NULL'
#> Warning in FUN(X[[i]], ...): data with name X_miss_col is not numeric and
#> not used
#> failed to create the sampler; sampling not done

fit_multivarmissing
#> Stan model 'multivarmissing2' does not contain samples.

References

Stan Development Team. 2016. Stan Modeling Language Users Guide and Reference Manual, Version 2.14.0. https://github.com/stan-dev/stan/releases/download/v2.14.0/stan-reference-2.14.0.pdf.

This example is derived from Simon Jackman, “Multivariate Missing Data”, 2002-06-18.↩