聚集标准误和缺失值答案

【问题标题】：clustered standard errors and missing values聚集标准误和缺失值
【发布时间】：2021-02-27 04:11:16
【问题描述】：

我有一个关于聚集标准错误和缺失值的问题。特别是，我想知道 R 和 Stata 中协方差矩阵的集群稳健估计器的实现如何处理集群变量具有缺失值但不作为协变量包含在回归模型中的情况。有没有一种方法可以被认为是解决这个问题的最佳实践？

有几种选择：

在拟合模型之前删除集群变量中缺失值的行
在拟合模型后删除聚类变量中缺失值的行，然后在拟合模型后和计算聚类标准误差之前删除缺失聚类信息的行
不要删除集群稳健标准误差，而是为集群变量中的缺失创建一个额外的组（例如，如果一个集群具有两个组 1 和 2，则将所有 NA 设置为 3）

处理此问题的最佳做法是什么？例如，R 的 multiwayvcov 似乎与选项 3 一起使用。

一个简单的例子来澄清我的问题：

library(sandwich)
library(multiwayvcov)
library(lmtest)

data("petersen")
petersen <- petersen[1:200, ]

lm_fit <- lm(y ~ x, data = petersen)
# with multiwayvcov
no_missings_mvcov <- coeftest(lm_fit, cluster.vcov(model = lm_fit, cluster = ~firmid + year))
# with sandwich
no_missings_sw <- coeftest(lm_fit, vcovCL(x = lm_fit,cluster = ~firmid + year ))

petersen[1, "year"] <- NA
petersen[2, "firmid"] <- NA

lm_fit2<- lm(y ~ x, data = petersen)
# with multiwayvcov
missings_mvcov <- coeftest(lm_fit2, cluster.vcov(model = lm_fit2, cluster = ~firmid + year))
# with sandwich
missings_sw <- coeftest(lm_fit2, vcovCL(x = lm_fit2, cluster = ~ firmid + year ))

# Warning messages:
# 1: In rowsum.default(newX[, i], ...) : missing values for 'group'
# 2: In rowsum.default(newX[, i], ...) : missing values for 'group'
# 3: In rowsum.default(newX[, i], ...) : missing values for 'group'
# 4: In rowsum.default(newX[, i], ...) : missing values for 'group'

# compare multiwayvcov
no_missings_mvcov
missings_mvcov
# > no_missings_mvcov
# 
# t test of coefficients:
#   
#   Estimate Std. Error t value  Pr(>|t|)    
# (Intercept) -0.26823    0.34145 -0.7856 0.4330687    
# x            0.91004    0.23183  3.9254 0.0001194 ***
#   ---
#   Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# 
# > missings_mvcov
# 
# t test of coefficients:
#   
#   Estimate Std. Error t value  Pr(>|t|)    
# (Intercept) -0.26823    0.33627 -0.7977     0.426    
# x            0.91004    0.22839  3.9847 9.493e-05 ***
#   ---
#   Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1


# compare sandwich 
no_missings_sw
missings_sw
# > no_missings_sw
# 
# t test of coefficients:
#   
#   Estimate Std. Error t value  Pr(>|t|)    
# (Intercept) -0.26823    0.34116 -0.7862 0.4326687    
# x            0.91004    0.23146  3.9317 0.0001166 ***
#   ---
#   Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# 
# > missings_sw
# 
# t test of coefficients:
#   
#   Estimate Std. Error t value  Pr(>|t|)    
# (Intercept) -0.26823    0.33732 -0.7952 0.4274630    
# x            0.91004    0.22963  3.9631 0.0001033 ***
#   ---
#   Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1


# A closer look at preprocessing in multiwayvcov and sandwich

preprocess_clusters_mwvcov <- function(model, cluster, debug = FALSE){
  
  if (inherits(cluster, "formula")) {
    cluster_tmp <- expand.model.frame(model, cluster, na.expand = FALSE)
    cluster <- model.frame(cluster, cluster_tmp, na.action = na.pass)
  }
  else {
    cluster <- as.data.frame(cluster, stringsAsFactors = FALSE)
  }
  
  cluster_dims <- ncol(cluster)
  tcc <- 2^cluster_dims - 1
  acc <- list()
  
  for (i in 1:cluster_dims) {
    acc <- append(acc, combn(1:cluster_dims, i, simplify = FALSE))
  }
  if (debug){print(acc)}
  
  acc <- acc[-1:-cluster_dims]
  
  if(debug){print(acc)}
  
  if (!is.null(model$na.action)) {
    if (class(model$na.action) == "exclude") {
      cluster <- cluster[-model$na.action, ]
    }
    else if (class(model$na.action) == "omit") {
      cluster <- cluster[-model$na.action, ]
    }
    cluster <- as.data.frame(cluster)
  }
  
  if (debug) 
    print(class(cluster))
  i <- !sapply(cluster, is.numeric)
  cluster[i] <- lapply(cluster[i], as.character)
  if (cluster_dims > 1) {
    for (i in acc) {
      cluster <- cbind(cluster, Reduce(paste0, cluster[, 
                                                       i]))
    }
  }
  cluster
}

# > head(preprocess_clusters_mwvcov(lm_fit, ~firmid + year))
# firmid year Reduce(paste0, cluster[, i])
# 1      1   NA                          1NA
# 2     NA    2                          NA2
# 3      1    3                           13
# 4      1    4                           14
# 5      1    5                           15
# 6      1    6                           16

# > sapply(preprocess_clusters_mwvcov(lm_fit, ~firmid + year), class)
# firmid                         year Reduce(paste0, cluster[, i]) 
# "integer"                    "integer"                     "factor" 


# NA handling in sandwich

preprocess_cluster_sandwich <- function(x, cluster, ...){
  
  if (is.list(x) && !is.null(x$na.action)) class(x$na.action) <- "omit"
  ef <- estfun(x, ...)
  k <- NCOL(ef)
  n <- NROW(ef)
  
  ## set up return value with correct dimension and names
  rval <- matrix(0, nrow = k, ncol = k,
                 dimnames = list(colnames(ef), colnames(ef)))
  
  ## cluster can either be supplied explicitly or
  ## be an attribute of the model...FIXME: other specifications?
  if (is.null(cluster)) cluster <- attr(x, "cluster")
  
  ## resort to cross-section if no clusters are supplied
  if (is.null(cluster)) cluster <- 1L:n
  
  ## collect 'cluster' variables in a data frame
  if(inherits(cluster, "formula")) {
    cluster_tmp <- if("Formula" %in% loadedNamespaces()) { ## FIXME to suppress potential warnings due to | in Formula
      suppressWarnings(expand.model.frame(x, cluster, na.expand = FALSE))
    } else {
      expand.model.frame(x, cluster, na.expand = FALSE)
    }
    cluster <- model.frame(cluster, cluster_tmp, na.action = na.pass)
  } else {
    cluster <- as.data.frame(cluster)
  }
  
  ## handle omitted or excluded observations
  if((n != NROW(cluster)) && !is.null(x$na.action) && (class(x$na.action) %in% c("exclude", "omit"))) {
    cluster <- cluster[-x$na.action, , drop = FALSE]
  }
  
  if(NROW(cluster) != n) stop("number of observations in 'cluster' and 'estfun()' do not match")
  
  return(cluster)    
  
}

head(preprocess_cluster_sandwich(lm_fit2, cluster = ~ firmid + year))
# > head(preprocess_cluster_sandwich(lm_fit2, cluster = ~ firmid + year))
# firmid year
# 1      1   NA
# 2     NA    2
# 3      1    3
# 4      1    4
# 5      1    5
# 6      1    6
sapply(head(preprocess_cluster_sandwich(lm_fit2, cluster = ~ firmid + year)), class)
# > sapply(head(preprocess_cluster_sandwich(lm_fit2, cluster = ~ firmid + year)), class)
# firmid      year 
# "integer" "integer"

如您所见，lm_fit 和lm_fit1 的标准误差不同，而点估计值相同。请注意，“三明治”会返回一条错误消息，原因是聚类变量中缺少值。函数preprocess_clusters_mwvcov 现在收集multiwayvcov 包的集群预处理。看起来multiwayvcov 在模型拟合后最终会丢弃这些缺失的集群（选项 2）。这与 reghdfe 形成对比，根据 Arthur 的回答，后者通过在拟合模型之前删除所有缺少集群变量的观测值来处理集群中的缺失值（选项 1）。

R 的 sandwich 包中的文档指出“如果模型 x 中的观察数量由于 NA 处理而小于原始数据中的观察数，则可以在必要时将相同的 NA 处理应用于集群（并且 x $na.action 可用）”，但没有说明如果集群变量中的观察数量小于模型 x 中的观察数量会发生什么。

【问题讨论】：

标签： r stata missing-data standard-error

【解决方案1】：

您的编辑让您看起来对sandwich 的工作方式更感兴趣。我会留下我的reghdfe 答案以供参考。

有没有一种方法可以被认为是解决这个问题的最佳实践？

如果你问的是在 Stata 中实现集群的包，那么答案是肯定的，它可能是 reghdfe。使用ssc install reghdfe 安装。

请参阅linked documentation，以及"Singletons, Cluster-Robust Standard Errors and Fixed Effects: A Bad Mix" 和"Linear Models with High-Dimensional Fixed Effects: An Efficient and Feasible Estimator"。

reghdfe 采用第一种方法，尽管您可以通过将组分配给非缺失值（即replace groupid = 9999 if mi(groupid)）来显式实现 (3)

以下验证 reghdfe 删除丢失的组：

sysuse auto
count // there are 74 obs
count if !mi(rep78) // five are missing the cluster var
local notMissClusterVar = `r(N)'  
reghdfe price weight length, noabsorb cluster(rep78) 
assert `e(N)' == `notMissCluster'

标准的regress 命令也是如此。

sysuse auto
count // there are 74 obs
count if !mi(rep78) // five are missing the cluster var
local notMissClusterVar = `r(N)'  
regress price weight length, vce(cluster rep78) 
assert `e(N)' == `notMissCluster'

处理这个问题的最佳实践是什么？

如果您要问是否应该将缺少您想要聚类的变量的观察视为自己的聚类或删除它们，那么我认为默认的答案应该是删除它们。尽管这当然取决于您的数据。请记住，聚类标准误差三明治估计器的假设是无限组，组内观察有限（请参阅Cameron & Trivedi (2005) p. 706-707。因此，当组数很大且缺少组信息的观察数很少时，方法 3 似乎问题最少. 但是，在很多情况下，不知道该组本身可能会被取消资格。

在您的示例中，您使用来自Petersen (2009) 的数据。这是一组公司，他最终建议按个人（公司）和时间段进行聚类。似乎很难证明包含来自未知时期或公司的观察结果是合理的，即使您不想聚集在公司标识符上。

当然，您可能会遇到这样一种情况，即推断缺失的观察来自同一组是有意义的。在这种情况下，这是您必须提出的论点。

【讨论】：

请注意，assert 语句将在具有单例的数据集中失败，因为 reghdfe 将删除它们。

【解决方案2】：

策略

我不知道有任何正式的证明，但我的感觉是策略 1 是总体上唯一合理的方法。至少当观察结果随机缺失时，忽略缺失应该不会出错（除了可能会损失一点效率）。

策略 2 似乎具有潜在危险，尤其是在存在许多缺失值的情况下。如果协方差矩阵仅根据完整数据集上模型的分数/残差的子集计算时保持一致，我会感到惊讶。

策略 3 可能没问题，但可能取决于特定应用程序是否在自己的集群中使用 NA 收集观察结果。因此，默认情况下我不会使用此策略。

R 实现

在软件方面：原则上，实现策略 1 很简单。但是，在 R 包sandwich 的特定情况下，它不是。这是由于封装的模块化设计：模型在调用vcovCL() 之前由用户安装，并且仅在meatCL() 中检测到问题，而bread() 提取器不受NA 影响。因此，我们决定抛出一个明确的错误消息，指示用户注意这一点。

插图

在你的例子中：

coeftest(lm_fit2, vcov = vcovCL, cluster = ~ firmid + year)
## Error in meatCL(x, cluster = cluster, type = type, ...) : 
##   cannot handle NAs in 'cluster': either refit the model without
##   the NA observations in 'cluster' or impute the NAs

因此，你应该这样做：

lm_fit3 <- lm(y ~ x, data = petersen, subset = !is.na(year) & !is.na(firmid))
coeftest(lm_fit3, vcov = vcovCL, cluster = ~ firmid + year)
## t test of coefficients:
## 
##             Estimate Std. Error t value  Pr(>|t|)    
## (Intercept) -0.30132    0.33879 -0.8894    0.3749    
## x            0.93566    0.23158  4.0404 7.659e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

可用性

从sandwich 3.0-1 开始提供这种改进的错误处理。在撰写本文时，这是当前的开发版本，可从 R-Forge 获得： install.packages("sandwich", repos="https://R-Forge.R-project.org")。另见：https://sandwich.R-Forge.R-project.org/news/。

【讨论】：