使用 data.table 加快行添加速度答案

【问题标题】：Speed up row addition with data.table使用 data.table 加快行添加速度
【发布时间】：2014-05-13 14:07:31
【问题描述】：

我用来自data.table 的 fread 读入了一个大数据集 (10 000 * 25 000)，它运行得非常快。现在我需要用它做一些基本的算术，相比之下，这相当慢。我想知道是否有人建议我可能做错了什么/有什么好的调整可以让它更快（我必须做 10 000 次，所以每一次改进都很重要！）。我想随机选择两次ngeno 行，将它们相加，并仅保留不全为 0（或 2）的列，例如在

  mytable = matrix(c(0, 0, 1, 2,
                     0, 1, 2, 2), ncol=4))

第 1 列和第 4 列需要被删除（分别是一开始就没有加起来；通常大约 40% 的列以这种方式被删除）。到目前为止我所拥有的

# Test data
nrow = 1000
ncol = 10000
ngeno = 2000
require(data.table)
dat <- data.table(matrix(rbinom(nrow*ncol, 2, 0.001), ncol=ncol))

# What I do
myway <- function(dat, nrow, ngeno) {
  set.seed(123)
  # Choose random rows
  haplo1 <- sample.int(nrow, ngeno, replace=T)
  haplo2 <- sample.int(nrow, ngeno, replace=T)
  # Add them up
  geno <- dat[haplo1,] + dat[haplo2,]
  rec.names <- names(dat)
  maf <- colMeans(geno)/2
  # throw out columns where every row has a 0 or a 2
  throw.out <- maf==0 | maf == 1
  rec.names <- rec.names[!throw.out]
  maf <- maf[!throw.out]
  geno <- subset(geno, select = rec.names)
  return(list(maf, geno))
}

myway2 <- function(dat, nrow, ngeno) {
  set.seed(123)
  haplo1 <- sample.int(nrow, ngeno, replace=T)
  haplo2 <- sample.int(nrow, ngeno, replace=T)
  dat <- data.table(t(dat))
  geno <- dat[,haplo1,with=F] + dat[,haplo2,with=F]
  geno <- data.table(t(geno))
  maf <- colMeans(geno)/2
  throw.out <- maf==0 | maf == 1
  maf <- maf[!throw.out]
  geno <- geno[, which(!throw.out), with=F]
  return(list(maf, geno))
}

eddisway <- function(dat, nrow, ngeno) {
  set.seed(123)
  dat.m <- as.matrix(dat)
  haplo1 <- sample.int(nrow, ngeno, replace=T)
  haplo2 <- sample.int(nrow, ngeno, replace=T)
  geno <- dat.m[haplo1,] + dat.m[haplo2,]
  maf <- colMeans(geno)/2
  throw.out <- maf==0 | maf == 1
  maf <- maf[!throw.out]
  geno <- geno[,!throw.out]
  return(list(maf, geno))
}

require(reshape2)
rolandsway <- function(dat, nrow, ngeno) {
  set.seed(123)
  dat1 <- melt(dat, variable.factor=FALSE)
  
  # Choose random rows
  haplo1 <- sample.int(nrow, ngeno, replace=T)
  haplo2 <- sample.int(nrow, ngeno, replace=T)
  
  geno <- dat1[, value[haplo1]+value[haplo2], by=variable]
  maf <- geno[, mean(V1)/2, by=variable]  
  
  maf <- maf[!(V1==0 | V1==1),]
  setkey(geno, variable)
  geno <- geno[maf[, variable],]
#   geno[,"v2":=rep(1:dim(dat)[1],dim(maf)[1]),with=F]
#   test <- dcast.data.table(geno, v2 ~ variable)
  return(list(maf, geno))
}

# Warning messages from Roland's method:
#   1: In melt.data.table(dat, variable.factor = FALSE) :
#   To be consistent with reshape2's melt, id.vars and measure.vars are internally guessed when both are 'NULL'. All non-numeric/integer/logical type columns are conisdered id.vars, which in this case are columns ''. Consider providing at least one of 'id' or 'measure' vars in future.

require(microbenchmark)
out <- microbenchmark(myway(dat, nrow, ngeno), myway2(dat, nrow, ngeno), eddisway(dat, nrow, ngeno), rolandsway(dat, nrow, ngeno), times=5)

目前的结果：

# Unit: seconds
#                               expr      min       lq   median       uq      max neval
# myway(dat, nrow, ngeno)      3.764377 3.804865 3.841819 3.924095 4.203679    10
# myway2(dat, nrow, ngeno)     3.595477 3.681658 3.703837 3.784004 3.851407    10
# eddisway(dat, nrow, ngeno)   1.388514 1.414389 1.438111 1.479081 1.574927    10
# rolandsway(dat, nrow, ngeno) 2.253587 2.299850 2.390655 2.579183 2.633778    10

分析myway 让我得到类似的东西

$by.self
                        self.time self.pct total.time total.pct
"[["                         0.94     18.8       2.56      51.2
"[[.data.frame"              0.54     10.8       1.62      32.4
"match"                      0.48      9.6       0.92      18.4
"[.data.table"               0.40      8.0       2.84      56.8
"Ops.data.frame"             0.34      6.8       1.44      28.8
"setattr"                    0.24      4.8       2.18      43.6
"<Anonymous>"                0.18      3.6       0.46       9.2

$by.total
                           total.time total.pct self.time self.pct
"myway"                          5.00     100.0      0.00      0.0
"[.data.table"                   2.84      56.8      0.40      8.0
"["                              2.84      56.8      0.00      0.0
"[["                             2.56      51.2      0.94     18.8
"alloc.col"                      2.26      45.2      0.06      1.2
"setattr"                        2.18      43.6      0.24      4.8
"+"                              1.96      39.2      0.00      0.0

由于“+”仅是从顶部算起的第 7 个（加法的行占用了大部分时间），我认为还有改进的余地。我尝试通过将 haplo1 和 haplo2 转换为布尔值来使用 subset，但这甚至需要更长的时间。编译也没有帮助。有什么建议可以让它更快吗？或者这已经结束了？

更新：

我更新了功能（另外，我修正了一个错字）。直到现在我才能用 Roland 的方法获得相同的输出格式。但即使没有它，Eddi 的建议似乎也更快。还有其他想法吗？

【问题讨论】：

您使用数据的方式只会减慢data.table 的速度，而应该使用矩阵（这并不是说没有更好的方法 - 我'我只是在评论现有代码）
感谢您的好提示！我实施了您的建议，到目前为止，它是最快的。我不会想到使用 data.table 会天真地减慢我的速度。

标签： r rows data.table subset

【解决方案1】：

melting 转换为长格式可以将速度提高 2 倍，但也会改变输出格式：

myway1 <- function(dat, nrow, ngeno) {

  dat1 <- melt(dat, variable.factor=FALSE)

  # Choose random rows
  haplo1 <- sample.int(nrow, ngeno, replace=T)
  haplo2 <- sample.int(nrow, ngeno, replace=T)

  geno <- dat1[, value[haplo1]+value[haplo2], by=variable]
  maf <- geno[, mean(V1)/2, by=variable]  

  maf <- maf[!(V1==0 | V1==1),]
  setkey(dat1, variable)
  geno <- dat1[maf[, variable],]
  return(list(maf, geno))
}

应该可以进一步改进。

【讨论】：

感谢您的建议。我需要与myway 相同的输出格式。不一定是data.table，也可以是宽格式。我尝试使用dcast.data.table 获取它，但到目前为止失败了（我还修复了我的一种类型；我想返回geno 的子集，而不是dat 的子集）。有趣的是，Eddi 的方法似乎更快，没有任何 data.table 方法。