【问题标题】:Rolling regression over multiple columns在多列上滚动回归
【发布时间】:2023-04-11 07:45:01
【问题描述】:

我在寻找最有效的方法来计算对具有多列的 xts 对象的滚动线性回归时遇到问题。我已经在 stackoverflow 上搜索并阅读了几个以前的问题。

这个question and answer 接近但在我看来还不够,因为我想计算多个回归,而因变量在所有回归中都保持不变。我试图用随机数据重现一个例子:

require(xts)
require(RcppArmadillo)  # Load libraries

data <- matrix(sample(1:10000, 1500), 1500, 5, byrow = TRUE)  # Random data
data[1000:1500, 2] <- NA  # insert NAs to make it more similar to true data
data <- xts(data, order.by = as.Date(1:1500, origin = "2000-01-01"))

NR <- nrow(data)  # number of observations
NC <- ncol(data)  # number of factors
obs <- 30  # required number of observations for rolling regression analysis
info.names <- c("res", "coef")

info <- array(NA, dim = c(NR, length(info.names), NC))
colnames(info) <- info.names

创建数组是为了存储多个变量(残差、系数等)随时间和每个因子的变化。

loop.begin.time <- Sys.time()

for (j in 2:NC) {
  cat(paste("Processing residuals for factor:", j), "\n")
  for (i in obs:NR) {
    regression.temp <- fastLm(data[i:(i-(obs-1)), j] ~ data[i:(i-(obs-1)), 1])
    residuals.temp <- regression.temp$residuals
    info[i, "res", j] <- round(residuals.temp[1] / sd(residuals.temp), 4)
    info[i, "coef", j] <- regression.temp$coefficients[2]
  } 
}

loop.end.time <- Sys.time()
print(loop.end.time - loop.begin.time)  # prints the loop runtime

正如循环所示,这个想法是每次针对其他因素之一运行 30 个观察值滚动回归,并将 data[, 1] 作为因变量(因子)。我必须将 30 个残差存储在一个临时对象中以便将它们标准化,因为 fastLm 不计算标准化残差。

如果 xts 对象中的列数(因子)增加到大约 100 到 1,000 列,那么循环会非常缓慢并且会变得很麻烦。我希望有一个更有效的代码来在大型数据集上创建滚动回归。

【问题讨论】:

  • 您可以通过不运行两次回归来使其速度提高 2 倍...我已将其编辑到您的问题中。
  • 当然可以!在欧洲这里已经很晚了。谢谢约书亚。这一变化将性能提高了 2-2.5 倍。但是,您是否认为此代码对于包含 2500 次每日观察和大约 1,000 个因子的数据集具有足够的性能?或者您是否知道与上述方法相比,使用 rollapply 可以提高性能?我猜如果数据集变得非常大,您必须应用递归最小二乘过滤器或相关的东西 - 对此有什么想法吗?

标签: r apply xts linear-regression rolling-computation


【解决方案1】:

这是使用rollRegres 包更快的方法

library(xts)
library(RcppArmadillo)

#####
# simulate data
set.seed(50554709)
data <- matrix(sample(1:10000, 1500), 1500, 5, byrow = TRUE)  # Random data
# data[1000:1500, 2] <- NA # only focus on the parts that are computed
data <- xts(data, order.by = as.Date(1:1500, origin = "2000-01-01"))

#####
# setup for solution in OP
NR <- nrow(data)
NC <- ncol(data)
obs <- 30L
info.names <- c("res", "coef")

info <- array(NA, dim = c(NR, length(info.names), NC))
colnames(info) <- info.names

#####
# solve with rollRegres
library(rollRegres)

loop.begin.time <- Sys.time()

X <- cbind(1, drop(data[, 1]))
out <- lapply(2:NC, function(j){
  fit <- roll_regres.fit(
    y = data[, j], x = X, width = obs, do_compute = c("sigmas"))

  # are you sure you want the residual of the first and not the last
  # observation in each window?
  idx <- 1:(nrow(data) - obs + 1L)
  idx_tail <- idx + obs - 1L
  resids <- c(rep(NA_real_, obs - 1L),
                  data[idx, j] - rowSums(fit$coefs[idx_tail, ] * X[idx, ]))

  # the package uses the unbaised estimator so we have to time by this factor
  # to get the same
  sds <-  fit$sigmas * sqrt((obs - 2L) / (obs - 1L))

  unclass(cbind(coef = fit$coefs[, 2L], res = drop(round(resids / sds, 4))))
})

loop.end.time <- Sys.time()
print(loop.end.time - loop.begin.time)
#R Time difference of 0.03123808 secs

#####
# solve with original method
loop.begin.time <- Sys.time()

for (j in 2:NC) {
  cat(paste("Processing residuals for factor:", j), "\n")
  for (i in obs:NR) {
    regression.temp <- fastLm(data[i:(i-(obs-1)), j] ~ data[i:(i-(obs-1)), 1])
    residuals.temp <- regression.temp$residuals
    info[i, "res", j] <- round(residuals.temp[1] / sd(residuals.temp), 4)
    info[i, "coef", j] <- regression.temp$coefficients[2]
  }
}
#R Processing residuals for factor: 2
#R Processing residuals for factor: 3
#R Processing residuals for factor: 4
#R Processing residuals for factor: 5

loop.end.time <- Sys.time()
print(loop.end.time - loop.begin.time)  # prints the loop runtime
#R Time difference of 7.554767 secs

#####
# check that results are the same
all.equal(info[, "coef", 2L], out[[1]][, "coef"])
#R [1] TRUE
all.equal(info[, "res" , 2L], out[[1]][, "res"])
#R [1] TRUE

all.equal(info[, "coef", 3L], out[[2]][, "coef"])
#R [1] TRUE
all.equal(info[, "res" , 3L], out[[2]][, "res"])
#R [1] TRUE

all.equal(info[, "coef", 4L], out[[3]][, "coef"])
#R [1] TRUE
all.equal(info[, "res" , 4L], out[[3]][, "res"])
#R [1] TRUE

all.equal(info[, "coef", 5L], out[[4]][, "coef"])
#R [1] TRUE
all.equal(info[, "res" , 5L], out[[4]][, "res"])
#R [1] TRUE

请注意上述解决方案中的此注释

# are you sure you want the residual of the first and not the last
# observation in each window?

这是与Sameer's answer的比较

library(rollRegres)
require(xts)

data <- matrix(sample(1:10000, 1500000, replace=T), 1500, 1000, byrow = TRUE)  # Random data
data <- xts(data, order.by = as.Date(1:1500, origin = "2000-01-01"))

NR <- nrow(data)  # number of observations
NC <- ncol(data)  # number of factors
obs <- 30  # required number of observations for rolling regression analysis

loop.begin.time <- Sys.time()

X <- cbind(1, drop(data[, 1]))
out <- lapply(2:NC, function(j){
  fit <- roll_regres.fit(
    y = data[, j], x = X, width = obs, do_compute = c("sigmas"))

  # are you sure you want the residual of the first and not the last
  # observation in each window?
  idx <- 1:(nrow(data) - obs + 1L)
  idx_tail <- idx + obs - 1L
  resids <- c(rep(NA_real_, obs - 1L),
              data[idx, j] - rowSums(fit$coefs[idx_tail, ] * X[idx, ]))

  # the package uses the unbaised estimator so we have to time by this factor
  # to get the same
  sds <-  fit$sigmas * sqrt((obs - 2L) / (obs - 1L))

  unclass(cbind(coef = fit$coefs[, 2L], res = drop(round(resids / sds, 4))))
})

loop.end.time <- Sys.time()
print(loop.end.time - loop.begin.time)
#R Time difference of 0.9019711 secs

时间包括用于计算标准化残差的时间。

【讨论】:

    【解决方案2】:

    如果您深入到线性回归的数学级别,应该会很快。如果 X 是自变量,Y 是因变量。系数由

    给出

    Beta = inv(t(X) %*% X) %*% (t(X) %*% Y)

    对于您希望哪个变量成为依赖变量以及哪个变量是独立变量,我有点困惑,但希望解决下面的类似问题也能对您有所帮助。

    在下面的示例中,我采用 1000 个变量而不是原来的 5 个变量,并且不引入任何 NA。

    require(xts)
    
    data <- matrix(sample(1:10000, 1500000, replace=T), 1500, 1000, byrow = TRUE)  # Random data
    data <- xts(data, order.by = as.Date(1:1500, origin = "2000-01-01"))
    
    NR <- nrow(data)  # number of observations
    NC <- ncol(data)  # number of factors
    obs <- 30  # required number of observations for rolling regression analysis
    

    现在我们可以使用 Joshua 的 TTR 包计算系数。

    library(TTR)
    
    loop.begin.time <- Sys.time()
    
    in.dep.var <- data[,1]
    xx <- TTR::runSum(in.dep.var*in.dep.var, obs)
    coeffs <- do.call(cbind, lapply(data, function(z) {
        xy <- TTR::runSum(z * in.dep.var, obs)
        xy/xx
    }))
    
    loop.end.time <- Sys.time()
    
    print(loop.end.time - loop.begin.time)  # prints the loop runtime
    

    时差 3.934461 秒

    res.array = array(NA, dim=c(NC, NR, obs))
    for(z in seq(obs)) {
      res.array[,,z] = coredata(data - lag.xts(coeffs, z-1) * as.numeric(in.dep.var))
    }
    res.sd <- apply(res.array, c(1,2), function(z) z / sd(z))
    

    如果我在索引res.sd 中没有犯任何错误,应该会给你标准化的残差。请随时修复此解决方案以纠正任何错误。

    【讨论】:

      猜你喜欢
      • 2021-08-20
      • 2013-01-21
      • 1970-01-01
      • 2021-01-20
      • 2017-09-16
      • 2020-08-10
      • 1970-01-01
      • 2018-06-16
      • 1970-01-01
      相关资源
      最近更新 更多