Correction to Measurement Error
Mark Lai
2024-09-06
correction-error.Rmd
When computing the reliability or the standard errors of factor scores, we typically condition on the estimates of the model parameters, aka assuming that they are known. However, when sample size is small, ignoring those can be problematic as the model parameters are imprecise. This vignette provides details on how to obtain more realistic estimates of standard errors of factor scores.
Unidimensional Case
Consider the conditional variance of the factor scores, , with . From the factor model, we have
The second term is the error component. Here is a small simulation:
#' Load package and set seed
library(lavaan)
#> This is lavaan 0.6-18
#> lavaan is FREE software! Please report any bugs.
library(R2spa)
set.seed(191254)
#' Fixed conditions
num_obs <- 100
lambda <- c(.3, .5, .7, .9)
theta <- c(.5, .4, .5, .7)
#' Simulate true score, and standardized
eta <- rnorm(num_obs)
eta <- (eta - mean(eta))
eta <- eta / sqrt(mean(eta^2))
#' Function for simulating y
simy <- function(eta, lambda) {
t(tcrossprod(lambda, eta) +
rnorm(length(lambda) * length(eta), sd = sqrt(theta)))
}
#' Simulation
nsim <- 2500
tfsy_sim <- fsy_sim <- matrix(NA, nrow = num_obs, ncol = nsim)
# Also save the scoring matrix
a_sim <- matrix(NA, nrow = length(lambda), ncol = nsim)
for (i in seq_len(nsim)) {
y <- simy(eta = eta, lambda = lambda)
tfsy_sim[, i] <- R2spa::compute_fscore(
y, lambda = lambda, theta = diag(theta), psi = matrix(1)
)
fsy <- R2spa::get_fs(y, std.lv = TRUE)
fsy_sim[, i] <- fsy[, 1]
a_sim[, i] <- attr(fsy, which = "scoring_matrix")
}
#' Average conditional variance
# a known
apply(tfsy_sim, 1, var) |> mean()
#> [1] 0.1871828
# compare to theoretical value
true_a <- R2spa:::compute_a_reg(lambda, psi = matrix(1), theta = diag(theta))
true_a %*% diag(theta) %*% t(true_a)
#> [,1]
#> [1,] 0.1893211
# a estimated
apply(fsy_sim, 1, var) |> mean()
#> [1] 0.2078126
The standard errors are larger when using sample estimates of weights.
When is known, the error variance is simply . However, when is unknown and estimated as from the data,
Under the assumption of = , the second term is 0. The first term is an expectation of a quadratic form,
where is the covariance matrix of . We can see this using the simulated data
correct_fac <- sum(diag(diag(theta) %*% cov(t(a_sim))))
true_a %*% diag(theta) %*% t(true_a) + correct_fac
#> [,1]
#> [1,] 0.2100938
Correction Factor
We can approximate using the delta method, and the other terms in the above using sample estimates
The delta method estimate, , is
where
is the Jacobian matrix, and
is the sampling variance of the sample estimator of
.
The Jacobian can be approximated using finite difference with
lavaan::lav_func_jacobian_complex()
.
y <- simy(eta = eta, lambda = lambda)
cfa_fit <- cfa("f =~ y1 + y2 + y3 + y4",
data = setNames(data.frame(y), paste0("y", 1:4)),
std.lv = TRUE)
# Jacobian
R2spa:::compute_a(coef(cfa_fit), lavobj = cfa_fit)
#> [[1]]
#> [,1] [,2] [,3] [,4]
#> f 0.09890619 0.3246767 0.450034 0.3134454
jac_a <- lavaan::lav_func_jacobian_complex(
\(x) R2spa:::compute_a(x, lavobj = cfa_fit)[[1]],
coef(cfa_fit)
)
sum(diag(lavInspect(cfa_fit, what = "est")$theta %*%
jac_a %*% vcov(cfa_fit) %*% t(jac_a)))
#> [1] 0.02152978
fsy <- R2spa::get_fs(y, std.lv = TRUE)
attr(fsy, which = "fsT")
#> fs_f1
#> fs_f1 0.2014365
# Using `get_fs(..., corrected_fsT = TRUE)
fsy2 <- R2spa::get_fs(y, std.lv = TRUE, corrected_fsT = TRUE)
attr(fsy2, which = "fsT")
#> fs_f1
#> fs_f1 0.2229663
Multidimensional Case
#' Set seed
set.seed(251329)
#' Fixed conditions
num_obs <- 100
lambda <- Matrix::bdiag(list(c(.3, .5, .7, .9), c(.7, .6, .7))) |>
as.matrix()
theta <- c(.5, .4, .5, .7, .7, .8, .5)
psi <- matrix(c(1, -.4, -.4, 1), nrow = 2)
#' Simulate true score (exact means and covariances)
eta <- MASS::mvrnorm(num_obs, mu = rep(0, 2), Sigma = psi, empirical = TRUE)
#' Function for simulating y
simy <- function(eta, lambda) {
tcrossprod(eta, lambda) +
MASS::mvrnorm(num_obs, mu = rep(0, length(theta)), Sigma = diag(theta))
}
#' Simulation
nsim <- 2500
tfsy_sim <- fsy_sim <- array(NA, dim = c(num_obs, ncol(lambda), nsim))
# Also save the scoring matrix
a_sim <- array(NA, dim = c(ncol(lambda), nrow(lambda), nsim))
for (i in seq_len(nsim)) {
y <- simy(eta = eta, lambda = lambda)
tfsy_sim[, , i] <- R2spa::compute_fscore(
y, lambda = lambda, theta = diag(theta), psi = psi
)
fsy <- R2spa::get_fs(
data.frame(y) |> setNames(paste0("y", 1:7)),
model = "f1 =~ y1 + y2 + y3 + y4\nf2 =~ y5 + y6 + y7",
std.lv = TRUE)
fsy_sim[, , i] <- as.matrix(fsy[, 1:2])
a_sim[, , i] <- attr(fsy, which = "scoring_matrix")
}
#' Average conditional variance
# a known
apply(tfsy_sim, MARGIN = 1, FUN = \(x) cov(t(x))) |>
rowMeans() |> matrix(ncol = 2, nrow = 2)
#> [,1] [,2]
#> [1,] 0.1789934 -0.0483855
#> [2,] -0.0483855 0.2022312
# compare to theoretical value
true_a <- R2spa:::compute_a_reg(lambda, psi = psi, theta = diag(theta))
true_a %*% diag(theta) %*% t(true_a)
#> [,1] [,2]
#> [1,] 0.18076106 -0.04855749
#> [2,] -0.04855749 0.20339709
# a estimated
apply(fsy_sim, MARGIN = 1, FUN = \(x) cov(t(x))) |>
rowMeans() |> matrix(ncol = 2, nrow = 2)
#> [,1] [,2]
#> [1,] 0.20175239 -0.05254471
#> [2,] -0.05254471 0.23427145
correct_fac11 <- sum(diag(diag(theta) %*% cov(t(a_sim[1, , ]))))
correct_fac22 <- sum(diag(diag(theta) %*% cov(t(a_sim[2, , ]))))
correct_fac21 <- sum(diag(diag(theta) %*%
cov(t(a_sim[2, , ]), t(a_sim[1, , ]))))
correct_fac <- matrix(c(correct_fac11, correct_fac21,
correct_fac21, correct_fac22), nrow = 2)
true_a %*% diag(theta) %*% t(true_a) + correct_fac
#> [,1] [,2]
#> [1,] 0.20114155 -0.05375924
#> [2,] -0.05375924 0.23429418
The , element of the correction matrix can be approximated by , with obtained by
y <- simy(eta = eta, lambda = lambda)
cfa_fit <- cfa("f1 =~ y1 + y2 + y3 + y4\nf2 =~ y5 + y6 + y7",
data = setNames(data.frame(y), paste0("y", 1:7)),
std.lv = TRUE)
# Jacobian
R2spa:::compute_a(coef(cfa_fit), lavobj = cfa_fit)
#> [[1]]
#> [,1] [,2] [,3] [,4] [,5] [,6]
#> f1 0.06546889 0.22696068 0.38977223 0.33101521 -0.05012587 -0.02226577
#> f2 -0.01227313 -0.04254719 -0.07306865 -0.06205377 0.37397828 0.16612010
#> [,7]
#> f1 -0.05314801
#> f2 0.39652577
jac_a1 <- lavaan::lav_func_jacobian_complex(
\(x) R2spa:::compute_a(x, lavobj = cfa_fit)[[1]][1, ],
coef(cfa_fit)
)
jac_a2 <- lavaan::lav_func_jacobian_complex(
\(x) R2spa:::compute_a(x, lavobj = cfa_fit)[[1]][2, ],
coef(cfa_fit)
)
# Correction[1, 1]
sum(diag(lavInspect(cfa_fit, what = "est")$theta %*%
jac_a1 %*% vcov(cfa_fit) %*% t(jac_a1)))
#> [1] 0.01551285
# Correction[2, 2]
sum(diag(lavInspect(cfa_fit, what = "est")$theta %*%
jac_a2 %*% vcov(cfa_fit) %*% t(jac_a2)))
#> [1] 0.02207038
# Correction[2, 1]
sum(diag(lavInspect(cfa_fit, what = "est")$theta %*%
jac_a2 %*% vcov(cfa_fit) %*% t(jac_a1)))
#> [1] -0.004873356
fsy <- R2spa::get_fs(
data.frame(y) |> setNames(paste0("y", 1:7)),
model = "f1 =~ y1 + y2 + y3 + y4\nf2 =~ y5 + y6 + y7",
std.lv = TRUE)
attr(fsy, which = "fsT")
#> fs_f1 fs_f2
#> fs_f1 0.16691097 -0.05653148
#> fs_f2 -0.05653148 0.19891917
# Using `get_fs(..., corrected_fsT = TRUE)
fsy2 <- R2spa::get_fs(
data.frame(y) |> setNames(paste0("y", 1:7)),
model = "f1 =~ y1 + y2 + y3 + y4\nf2 =~ y5 + y6 + y7",
std.lv = TRUE,
corrected_fsT = TRUE)
attr(fsy2, which = "fsT")
#> fs_f1 fs_f2
#> fs_f1 0.18242382 -0.06140484
#> fs_f2 -0.06140484 0.22098955