【发布时间】:2016-12-28 01:46:29
【问题描述】:
我有一个正大矩阵:
set.seed(1)
mat <- matrix(abs(rnorm(130000*1000)),nrow=130000,ncol=1000)
rownames(mat) <- paste("r",1:nrow(mat),sep="")
mat 的 rownames 与 parent.id 相关联:
row.ids.df <- data.frame(row.id=rownames(mat),parent.id=paste("p",sort(sample(13000,130000,replace=T)),sep=""))
这样每隔几行就与同一个parent.id相关联。
我需要为mat 中的每个row 计算这些操作:
meanoflog的行元素mean该行占所有具有相同parent.id的行的比例mean该行在所有具有相同parent.id的行中所占比例的概率sdsd该行在所有具有相同parent.id的行中所占的比例概率
当然这是想到的第一个解决方案:
require(VGAM)
res.df <- do.call(rbind,mclapply(1:nrow(mat), function(x) {
idx <- which(rownames(mat) %in% row.ids.df$row.id[which(row.ids.df$parent.id == row.ids.df$parent.id[which(row.ids.df$row.id == rownames(mat)[x])])])
data.frame(mean.log=mean(log(mat[x,])),
mean.proportion=mean(mat[x,]/apply(mat[idx,],2,sum)),
mean.probit=mean(probit(mat[x,]/apply(mat[idx,],2,sum))),
sd.probit=sd(probit(mat[x,]/apply(mat[idx,],2,sum))))
}))
但我想知道是否有任何方法可以更快地实现这一目标。
附言
我不认为使用data.table 代替矩阵是要走的路:
require(data.table)
require(microbenchmark)
require(VGAM)
set.seed(1)
mat <- data.table(matrix(abs(rnorm(13*5)),nrow=13,ncol=5))
rownames(mat) <- paste("r",1:nrow(mat),sep="")
row.ids.df <- data.frame(row.id=rownames(mat),parent.id=paste("p",sort(sample(2,13,replace=T)),sep=""))
microbenchmark(df <- do.call(rbind,lapply(1:nrow(mat), function(x) {
idx <- which(rownames(mat) %in% row.ids.df$row.id[which(row.ids.df$parent.id == row.ids.df$parent.id[which(row.ids.df$row.id == rownames(mat)[x])])])
data.frame(mean.log=mean(as.numeric(log(mat[x,]))),
mean.proportion=mean(as.numeric(mat[x,])/apply(mat[idx,],2,sum)),
mean.probit=mean(probit(as.numeric(mat[x,])/apply(mat[idx,],2,sum))),
sd.probit=sd(probit(as.numeric(mat[x,])/apply(mat[idx,],2,sum))))
})))
expr
df <- do.call(rbind, lapply(1:nrow(mat), function(x) { idx <- which(rownames(mat) %in% row.ids.df$row.id[which(row.ids.df$parent.id == row.ids.df$parent.id[which(row.ids.df$row.id == rownames(mat)[x])])]) data.frame(mean.log = mean(as.numeric(log(mat[x, ]))), mean.proportion = mean(as.numeric(mat[x, ])/apply(mat[idx, ], 2, sum)), mean.probit = mean(probit(as.numeric(mat[x, ])/apply(mat[idx, ], 2, sum))), sd.probit = sd(probit(as.numeric(mat[x, ])/apply(mat[idx, ], 2, sum)))) }))
min lq mean median uq max neval
65.08929 66.49415 69.78937 67.70534 70.38044 206.017 100
>
相比:
set.seed(1)
mat <- matrix(abs(rnorm(13*5)),nrow=13,ncol=5)
rownames(mat) <- paste("r",1:nrow(mat),sep="")
row.ids.df <- data.frame(row.id=rownames(mat),parent.id=paste("p",sort(sample(2,13,replace=T)),sep=""))
require(VGAM)
microbenchmark(df <- do.call(rbind,lapply(1:nrow(mat), function(x) {
idx <- which(rownames(mat) %in% row.ids.df$row.id[which(row.ids.df$parent.id == row.ids.df$parent.id[which(row.ids.df$row.id == rownames(mat)[x])])])
data.frame(mean.log=mean(log(mat[x,])),
mean.proportion=mean(mat[x,]/apply(mat[idx,],2,sum)),
mean.probit=mean(probit(mat[x,]/apply(mat[idx,],2,sum))),
sd.probit=sd(probit(mat[x,]/apply(mat[idx,],2,sum))))
})))
Unit: milliseconds
expr
df <- do.call(rbind, lapply(1:nrow(mat), function(x) { idx <- which(rownames(mat) %in% row.ids.df$row.id[which(row.ids.df$parent.id == row.ids.df$parent.id[which(row.ids.df$row.id == rownames(mat)[x])])]) data.frame(mean.log = mean(log(mat[x, ])), mean.proportion = mean(mat[x, ]/apply(mat[idx, ], 2, sum)), mean.probit = mean(probit(mat[x, ]/apply(mat[idx, ], 2, sum))), sd.probit = sd(probit(mat[x, ]/apply(mat[idx, ], 2, sum)))) }))
min lq mean median uq max neval
10.15047 10.2894 10.69573 10.428 10.69741 14.56724 100
除非每次我想在data.table 行上运行操作时都应用as.numeric 是个坏主意。
【问题讨论】:
-
你试过data.table吗?
-
没有。好主意。
-
这实际上不是一个好主意 - 请参阅更新后的帖子
-
使用 data.table 的方式有一些技巧。我实际上并不是那么专家,但是一旦 akrun 或有人看到这一点,他们就会向您展示。
标签: r performance matrix parallel-processing