加速 R 中的矩阵行和列操作答案

【问题标题】：Speeding up matrix row and column operations in R加速 R 中的矩阵行和列操作
【发布时间】：2016-12-28 01:46:29
【问题描述】：

我有一个正大矩阵：

set.seed(1)
mat <- matrix(abs(rnorm(130000*1000)),nrow=130000,ncol=1000)
rownames(mat) <- paste("r",1:nrow(mat),sep="")

mat 的 rownames 与 parent.id 相关联：

row.ids.df <- data.frame(row.id=rownames(mat),parent.id=paste("p",sort(sample(13000,130000,replace=T)),sep=""))

这样每隔几行就与同一个parent.id相关联。

我需要为mat 中的每个row 计算这些操作：

mean of log 的行元素
mean该行占所有具有相同parent.id的行的比例
mean该行在所有具有相同parent.id的行中所占比例的概率
sdsd该行在所有具有相同parent.id的行中所占的比例概率

当然这是想到的第一个解决方案：

require(VGAM)
res.df <- do.call(rbind,mclapply(1:nrow(mat), function(x) {
  idx <- which(rownames(mat) %in% row.ids.df$row.id[which(row.ids.df$parent.id == row.ids.df$parent.id[which(row.ids.df$row.id == rownames(mat)[x])])])
  data.frame(mean.log=mean(log(mat[x,])),
             mean.proportion=mean(mat[x,]/apply(mat[idx,],2,sum)),
             mean.probit=mean(probit(mat[x,]/apply(mat[idx,],2,sum))),
             sd.probit=sd(probit(mat[x,]/apply(mat[idx,],2,sum))))
}))

但我想知道是否有任何方法可以更快地实现这一目标。

附言

我不认为使用data.table 代替矩阵是要走的路：

require(data.table)
require(microbenchmark)
require(VGAM)

set.seed(1)
mat <- data.table(matrix(abs(rnorm(13*5)),nrow=13,ncol=5))
rownames(mat) <- paste("r",1:nrow(mat),sep="")
row.ids.df <- data.frame(row.id=rownames(mat),parent.id=paste("p",sort(sample(2,13,replace=T)),sep=""))

microbenchmark(df <- do.call(rbind,lapply(1:nrow(mat), function(x) {
  idx <- which(rownames(mat) %in% row.ids.df$row.id[which(row.ids.df$parent.id == row.ids.df$parent.id[which(row.ids.df$row.id == rownames(mat)[x])])])
  data.frame(mean.log=mean(as.numeric(log(mat[x,]))),
             mean.proportion=mean(as.numeric(mat[x,])/apply(mat[idx,],2,sum)),
             mean.probit=mean(probit(as.numeric(mat[x,])/apply(mat[idx,],2,sum))),
             sd.probit=sd(probit(as.numeric(mat[x,])/apply(mat[idx,],2,sum))))
})))


expr
 df <- do.call(rbind, lapply(1:nrow(mat), function(x) {     idx <- which(rownames(mat) %in% row.ids.df$row.id[which(row.ids.df$parent.id ==          row.ids.df$parent.id[which(row.ids.df$row.id == rownames(mat)[x])])])     data.frame(mean.log = mean(as.numeric(log(mat[x, ]))), mean.proportion = mean(as.numeric(mat[x,          ])/apply(mat[idx, ], 2, sum)), mean.probit = mean(probit(as.numeric(mat[x,          ])/apply(mat[idx, ], 2, sum))), sd.probit = sd(probit(as.numeric(mat[x,          ])/apply(mat[idx, ], 2, sum)))) }))
      min       lq     mean   median       uq     max neval
 65.08929 66.49415 69.78937 67.70534 70.38044 206.017   100
>

相比：

set.seed(1)
mat <- matrix(abs(rnorm(13*5)),nrow=13,ncol=5)
rownames(mat) <- paste("r",1:nrow(mat),sep="")
row.ids.df <- data.frame(row.id=rownames(mat),parent.id=paste("p",sort(sample(2,13,replace=T)),sep=""))

require(VGAM)
microbenchmark(df <- do.call(rbind,lapply(1:nrow(mat), function(x) {
  idx <- which(rownames(mat) %in% row.ids.df$row.id[which(row.ids.df$parent.id == row.ids.df$parent.id[which(row.ids.df$row.id == rownames(mat)[x])])])
  data.frame(mean.log=mean(log(mat[x,])),
             mean.proportion=mean(mat[x,]/apply(mat[idx,],2,sum)),
             mean.probit=mean(probit(mat[x,]/apply(mat[idx,],2,sum))),
             sd.probit=sd(probit(mat[x,]/apply(mat[idx,],2,sum))))
})))


Unit: milliseconds
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            expr
 df <- do.call(rbind, lapply(1:nrow(mat), function(x) {     idx <- which(rownames(mat) %in% row.ids.df$row.id[which(row.ids.df$parent.id ==          row.ids.df$parent.id[which(row.ids.df$row.id == rownames(mat)[x])])])     data.frame(mean.log = mean(log(mat[x, ])), mean.proportion = mean(mat[x,          ]/apply(mat[idx, ], 2, sum)), mean.probit = mean(probit(mat[x,          ]/apply(mat[idx, ], 2, sum))), sd.probit = sd(probit(mat[x,          ]/apply(mat[idx, ], 2, sum)))) }))
      min      lq     mean median       uq      max neval
 10.15047 10.2894 10.69573 10.428 10.69741 14.56724   100

除非每次我想在data.table 行上运行操作时都应用as.numeric 是个坏主意。

【问题讨论】：

你试过data.table吗？
没有。好主意。
这实际上不是一个好主意 - 请参阅更新后的帖子
使用 data.table 的方式有一些技巧。我实际上并不是那么专家，但是一旦 akrun 或有人看到这一点，他们就会向您展示。

标签： r performance matrix parallel-processing

【解决方案1】：

我不认为使用 data.table 代替矩阵是要走的路

显然，您必须实际使用 data.table。它不是一根神奇的魔杖，可以在你不花一些精力的情况下优化你的代码。您需要使用 data.table 语法。

我需要为 mat 中的每一行计算这些操作：

mean of log of the row elements

mean proportion of of that row out of all rows with the same parent.id

mean probit of the proportion of of that row out of all rows with the same parent.id

sd probit of the proportion of of that row out of all rows with the same parent.id

我认为这可能会满足您的需求：

library(data.table)
DT <- data.table(row.ids.df, mat)
DT <- melt(DT, id.vars = c("row.id", "parent.id"))

DT[, proportion := value / sum(value), by = .(variable, parent.id)]

res <- DT[, .(
  mean.log = mean(log(value)),
  mean.proportion = mean(proportion),
  mean.probit = mean(probit(proportion)),
  sd.probit = sd(probit(proportion))), by = row.id]

all.equal(res[["sd.probit"]], 
          res.df[["sd.probit"]])
#[1] TRUE
#(Tested with 100 rows and 30 columns.)

我希望它更高效，但它肯定更具可读性。

【讨论】：