【发布时间】:2021-02-27 04:11:16
【问题描述】:
我有一个关于聚集标准错误和缺失值的问题。特别是,我想知道 R 和 Stata 中协方差矩阵的集群稳健估计器的实现如何处理集群变量具有缺失值但不作为协变量包含在回归模型中的情况。有没有一种方法可以被认为是解决这个问题的最佳实践?
有几种选择:
- 在拟合模型之前删除集群变量中缺失值的行
- 在拟合模型后删除聚类变量中缺失值的行,然后在拟合模型后和计算聚类标准误差之前删除缺失聚类信息的行
- 不要删除集群稳健标准误差,而是为集群变量中的缺失创建一个额外的组(例如,如果一个集群具有两个组 1 和 2,则将所有 NA 设置为 3)
处理此问题的最佳做法是什么?例如,R 的 multiwayvcov 似乎与选项 3 一起使用。
一个简单的例子来澄清我的问题:
library(sandwich)
library(multiwayvcov)
library(lmtest)
data("petersen")
petersen <- petersen[1:200, ]
lm_fit <- lm(y ~ x, data = petersen)
# with multiwayvcov
no_missings_mvcov <- coeftest(lm_fit, cluster.vcov(model = lm_fit, cluster = ~firmid + year))
# with sandwich
no_missings_sw <- coeftest(lm_fit, vcovCL(x = lm_fit,cluster = ~firmid + year ))
petersen[1, "year"] <- NA
petersen[2, "firmid"] <- NA
lm_fit2<- lm(y ~ x, data = petersen)
# with multiwayvcov
missings_mvcov <- coeftest(lm_fit2, cluster.vcov(model = lm_fit2, cluster = ~firmid + year))
# with sandwich
missings_sw <- coeftest(lm_fit2, vcovCL(x = lm_fit2, cluster = ~ firmid + year ))
# Warning messages:
# 1: In rowsum.default(newX[, i], ...) : missing values for 'group'
# 2: In rowsum.default(newX[, i], ...) : missing values for 'group'
# 3: In rowsum.default(newX[, i], ...) : missing values for 'group'
# 4: In rowsum.default(newX[, i], ...) : missing values for 'group'
# compare multiwayvcov
no_missings_mvcov
missings_mvcov
# > no_missings_mvcov
#
# t test of coefficients:
#
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) -0.26823 0.34145 -0.7856 0.4330687
# x 0.91004 0.23183 3.9254 0.0001194 ***
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
# > missings_mvcov
#
# t test of coefficients:
#
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) -0.26823 0.33627 -0.7977 0.426
# x 0.91004 0.22839 3.9847 9.493e-05 ***
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# compare sandwich
no_missings_sw
missings_sw
# > no_missings_sw
#
# t test of coefficients:
#
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) -0.26823 0.34116 -0.7862 0.4326687
# x 0.91004 0.23146 3.9317 0.0001166 ***
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
# > missings_sw
#
# t test of coefficients:
#
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) -0.26823 0.33732 -0.7952 0.4274630
# x 0.91004 0.22963 3.9631 0.0001033 ***
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# A closer look at preprocessing in multiwayvcov and sandwich
preprocess_clusters_mwvcov <- function(model, cluster, debug = FALSE){
if (inherits(cluster, "formula")) {
cluster_tmp <- expand.model.frame(model, cluster, na.expand = FALSE)
cluster <- model.frame(cluster, cluster_tmp, na.action = na.pass)
}
else {
cluster <- as.data.frame(cluster, stringsAsFactors = FALSE)
}
cluster_dims <- ncol(cluster)
tcc <- 2^cluster_dims - 1
acc <- list()
for (i in 1:cluster_dims) {
acc <- append(acc, combn(1:cluster_dims, i, simplify = FALSE))
}
if (debug){print(acc)}
acc <- acc[-1:-cluster_dims]
if(debug){print(acc)}
if (!is.null(model$na.action)) {
if (class(model$na.action) == "exclude") {
cluster <- cluster[-model$na.action, ]
}
else if (class(model$na.action) == "omit") {
cluster <- cluster[-model$na.action, ]
}
cluster <- as.data.frame(cluster)
}
if (debug)
print(class(cluster))
i <- !sapply(cluster, is.numeric)
cluster[i] <- lapply(cluster[i], as.character)
if (cluster_dims > 1) {
for (i in acc) {
cluster <- cbind(cluster, Reduce(paste0, cluster[,
i]))
}
}
cluster
}
# > head(preprocess_clusters_mwvcov(lm_fit, ~firmid + year))
# firmid year Reduce(paste0, cluster[, i])
# 1 1 NA 1NA
# 2 NA 2 NA2
# 3 1 3 13
# 4 1 4 14
# 5 1 5 15
# 6 1 6 16
# > sapply(preprocess_clusters_mwvcov(lm_fit, ~firmid + year), class)
# firmid year Reduce(paste0, cluster[, i])
# "integer" "integer" "factor"
# NA handling in sandwich
preprocess_cluster_sandwich <- function(x, cluster, ...){
if (is.list(x) && !is.null(x$na.action)) class(x$na.action) <- "omit"
ef <- estfun(x, ...)
k <- NCOL(ef)
n <- NROW(ef)
## set up return value with correct dimension and names
rval <- matrix(0, nrow = k, ncol = k,
dimnames = list(colnames(ef), colnames(ef)))
## cluster can either be supplied explicitly or
## be an attribute of the model...FIXME: other specifications?
if (is.null(cluster)) cluster <- attr(x, "cluster")
## resort to cross-section if no clusters are supplied
if (is.null(cluster)) cluster <- 1L:n
## collect 'cluster' variables in a data frame
if(inherits(cluster, "formula")) {
cluster_tmp <- if("Formula" %in% loadedNamespaces()) { ## FIXME to suppress potential warnings due to | in Formula
suppressWarnings(expand.model.frame(x, cluster, na.expand = FALSE))
} else {
expand.model.frame(x, cluster, na.expand = FALSE)
}
cluster <- model.frame(cluster, cluster_tmp, na.action = na.pass)
} else {
cluster <- as.data.frame(cluster)
}
## handle omitted or excluded observations
if((n != NROW(cluster)) && !is.null(x$na.action) && (class(x$na.action) %in% c("exclude", "omit"))) {
cluster <- cluster[-x$na.action, , drop = FALSE]
}
if(NROW(cluster) != n) stop("number of observations in 'cluster' and 'estfun()' do not match")
return(cluster)
}
head(preprocess_cluster_sandwich(lm_fit2, cluster = ~ firmid + year))
# > head(preprocess_cluster_sandwich(lm_fit2, cluster = ~ firmid + year))
# firmid year
# 1 1 NA
# 2 NA 2
# 3 1 3
# 4 1 4
# 5 1 5
# 6 1 6
sapply(head(preprocess_cluster_sandwich(lm_fit2, cluster = ~ firmid + year)), class)
# > sapply(head(preprocess_cluster_sandwich(lm_fit2, cluster = ~ firmid + year)), class)
# firmid year
# "integer" "integer"
如您所见,lm_fit 和lm_fit1 的标准误差不同,而点估计值相同。请注意,“三明治”会返回一条错误消息,原因是聚类变量中缺少值。
函数preprocess_clusters_mwvcov 现在收集multiwayvcov 包的集群预处理。看起来multiwayvcov 在模型拟合后最终会丢弃这些缺失的集群(选项 2)。这与 reghdfe 形成对比,根据 Arthur 的回答,后者通过在拟合模型之前删除所有缺少集群变量的观测值来处理集群中的缺失值(选项 1)。
R 的 sandwich 包中的文档指出“如果模型 x 中的观察数量由于 NA 处理而小于原始数据中的观察数,则可以在必要时将相同的 NA 处理应用于集群(并且 x $na.action 可用)”,但没有说明如果集群变量中的观察数量小于模型 x 中的观察数量会发生什么。
【问题讨论】:
标签: r stata missing-data standard-error