【发布时间】:2021-02-02 12:53:24
【问题描述】:
我正在添加模型系数的方差,然后返回总和的平均值。
我只是想检查哪种回归方法对异常值更稳健。我会研究很多场景。
但是,我的代码告诉我普通最小二乘是最好的,但这不是预期的结果,因为 MM 估计和 Huber 被称为稳健回归方法。
我的代码有问题吗?
#####################################
rmn <- function(n, mu) {
p <- length(mu)
matrix(rnorm(n*p, mean = mu), ncol = p)
}
#####################################
RI<-function(y,x,a,mu,R=30,t=1000){
x <- as.matrix(x)
dm <- dim(x)
n <- dm[1]
bias1 <- bias2 <- bias3 <- numeric(t)
b1 <- b2<- b3 <- numeric(R)
### Outliers in X ######
for (j in 1:t) {
for (i in 1:R) {
id <- sample(n, a * n)
z <- x
z[id, ] <- rmn(length(id), mu)
b1[i] <- var(coef(lm(y ~., data = data.frame(z))))
b2[i] <- var(coef(rlm(y ~ ., data = data.frame(z), maxit = 2000, method = "MM")))
b3[i] <- var(coef(rlm(y ~ ., data = data.frame(z), psi = psi.huber,maxit = 300)))
}
bias1[j] <- sum(b1); bias2[j] <- sum(b2); bias3[j] <- sum(b3)
}
bias <- cbind("lm" = bias1,"MM-rlm" = bias2, "H-rlm" = bias3)
colMeans(bias)
}
#####################################
p <- 5
n <- 300
x<- matrix(rnorm(n * p), ncol = p)
y<-rnorm(n)
a=0.2
mu <-colMeans(x)+10
#####################################
RI(y,x,a,mu)
#####################################
更新 由于第一个提供的答案,我改变了衡量鲁棒性的想法。
我通过计算数据未受污染和数据受污染时的系数之间的平均绝对差来衡量稳健性。我首先在 y 中引入异常值,然后在 x 中引入异常值。我还是有问题。
############ R CODE ##############
rmn <- function(n, mu, seed = TRUE) {
if (seed) set.seed(12345)
p <- length(mu)
matrix( rnorm(n * p, mean = mu), ncol = p)
}
##################################
out.cv <- function(y, x, a, mu, R = 500, seed = TRUE) {
## y: response variable
## x: independent variables
## a: percent of outliers
## mu: how far should the outliers be. A vector if outliers in x,
## or a single number if outliers in y
## R: how many times to repeat this process
x <- as.matrix(x)
dm <- dim(x)
n <- dm[1] ; d <- dm[2] + 1
b1 <- b2<- b3 <- numeric(R)
be <- coef( lm(y ~., data = as.data.frame(x[,-1]) ) )
####################################
### Outliers in Y ######
if ( length(mu) == 1 ) {
for (i in 1:R) {
if (seed) set.seed(12345)
id <- sample(n, a * n)
z <- y
if (seed) set.seed(12345)
z[id] <- rnorm(id, mu) ## mu has to be a single number here
## mean absolute difference between coefficients of clean data
## and coefficients with contaminated data
b1[i] <- mean( abs( coef( lm(z ~., data = as.data.frame(x[,-1])) ) - be) )
b2[i] <- mean( abs( coef( rlm(z ~ ., data = data.frame(x[,-1]), maxit = 2000, method = "MM") ) - be ) )
b3[i] <- mean( abs( coef( rlm(z ~ ., data = data.frame(x[,-1]), psi = psi.huber,maxit = 300) ) - be ) )
}
########################
##### Outliers in X #########
} else {
for (i in 1:R) {
if (seed) set.seed(12345)
id <- sample(n, a * n)
z <- x
z[id, ] <- rmn( length(id), mu, seed ) ## mu must be a vector
b1[i] <- mean( abs( coef( lm(y ~., data = as.data.frame(z[,-1])) )- be) )
b2[i] <- mean( abs( coef( rlm(y ~ ., data = data.frame(z[,-1]), maxit = 2000, method = "MM") ) - be ) )
b3[i] <- mean( abs( coef( rlm(y ~ ., data = data.frame(z[,-1]), psi = psi.huber,maxit = 300) ) - be ) )
}
}
bias1 <- mean(b1) ; bias2 <- mean(b2); bias3 <- mean(b3)
bias <- c(bias1, bias2, bias3)
names(bias) <- c("lm", "MM-rlm","Huber-rlm")
bias
}
################################
p <- 5
n <- 200
##############################
# Independent X and Y ####
#set.seed(12345)
#x<- matrix( rnorm(n * p), ncol = p)
#y<-rnorm(n)
## Related X and Y ####
set.seed(12345)
x <- rmn(n, numeric(p))
ber <- rnorm(p)
m <- x %*% ber
y <- rnorm(n, m, 1)
############################
a <- 0.2 #outliers 10%
mu <- 15 ## outliers in y
out.cv(y, x, a, mu)
###########################
mu <-colMeans(x)+15 ## outliers in x
out.cv(y, x, a, mu)
###################
【问题讨论】:
-
我不明白你的统计问题。 “X中的异常值”到底是什么意思?有影响力的价值观?
-
@Roland,是的,我会在 X 中创建异常值,在另一种情况下使用 y 生成异常值。我想看看回归方法如何受到这些场景的影响。
标签: r regression linear-regression