TL;DR
如果你使用的是 data.table 1.9.6,请使用:
DT[, .SD[1L], by = .(Sequence, Parameter, Experiment)]
如果你使用的是 1.9.7 及以上版本,也可以使用:
DT[, head(.SD,1L), by = .(Sequence, Parameter, Experiment)]
一些基准测试
下面是在 1e6 行和 1e7 行数据上的基准测试。由于最近引入了针对 head(.SD, 1) 的优化,这里还测试了 data.table 的开发版(dev)。
我使用了 @HywelMJ 生成的数据集,但它似乎不像 OP 的数据集那样按 Time 列排序,因此这里的数据是无序的。一旦 OP 提供可重现的示例,我可能会更新这些计时结果。
由于排序不同,并且 HywelMJ 的答案中使用了 mult = "first",各方法得到的结果并不相同。根据 OP 打印出来的数据,我假设 Jaap 和 nicola 的答案是正确的。
# 1e6 - data.table 1.9.6 ----
# Benchmark of three "first row per group" approaches on a 1e6-row table.
# The lines starting with "#" directly after a call are the output that was
# recorded when the call was run; they are not regenerated by this script.
# install.packages("data.table")
packageVersion("data.table")
#[1] ‘1.9.6’
library(data.table)
# Test data: two numeric columns of 1,000,000 rows each, holding rounded
# uniform draws (Experiment in 1..2000, Parameter in 1..9).
DT <- as.data.table(cbind(Experiment = round(runif(1000000,min = 1, max = 2000)),Parameter = round(runif(1000000,min = 1,max = 9))))
# Sequence numbers the rows 1..N within each Experiment, so every
# (Experiment, Sequence) pair occurs once and each
# (Sequence, Parameter, Experiment) group used below holds exactly one row.
DT[,Sequence:= seq_len(.N),keyby = Experiment]
# Time: one value per row, sampled with replacement from 1..60.
DT[,Time := sample(1:60,.N,replace = TRUE)]
# Shuffle the rows once and keep the shuffled table as the common input.
DT.backup = DT[sample(nrow(DT))] # ensure random order
# Every approach starts from a fresh copy of the shuffled table, because
# setkey() in the join approach reorders DT by reference.
DT = copy(DT.backup)
# Approach 1: head(.SD, 1L) evaluated per group.
system.time(
r.head <- DT[, head(.SD,1L), by = .(Sequence, Parameter, Experiment)]
)
# user system elapsed
# 8.420 0.000 8.408
DT = copy(DT.backup)
# Approach 2: .SD[1L] evaluated per group.
system.time(
r.sd1 <- DT[, .SD[1L], by = .(Sequence, Parameter, Experiment)]
)
# user system elapsed
# 0.664 0.000 0.664
DT = copy(DT.backup)
# Approach 3: key the table on the grouping columns, then join it against its
# distinct (Experiment, Sequence, Parameter) combinations, keeping the first
# match for each. The timing includes the setkey() call.
# NOTE(review): the columns of the lookup table are listed as
# Experiment, Sequence, Parameter while the key is Sequence, Parameter,
# Experiment; if the keyed join pairs columns by position, different columns
# are being matched against each other -- confirm against the data.table docs.
system.time({
setkey(DT,Sequence,Parameter,Experiment)
r.join <- DT[unique(DT[,.(Experiment,Sequence,Parameter)]),mult = "first"]
})
# user system elapsed
# 0.332 0.000 0.331
# The two grouped approaches produced identical results.
all.equal(r.head, r.sd1)
#[1] TRUE
# Compare the grouped result with the join result on their four shared
# columns after sorting both; the recorded output reports a mismatch.
# NOTE(review): unlike the other sections of this script, the ordering here
# does not include Time.
all.equal(r.head[order(Sequence,Parameter,Experiment), .(Sequence,Parameter,Experiment,Time)],
r.join[order(Sequence,Parameter,Experiment), .(Sequence,Parameter,Experiment,Time)])
#[1] "Attributes: < Length mismatch: comparison on first 1 components >"
# 1e7 - data.table 1.9.6 ----
# Same three "first row per group" approaches, on a 1e7-row table.
# The lines starting with "#" directly after a call are the output that was
# recorded when the call was run; they are not regenerated by this script.
# Test data: two numeric columns of 1e7 rows each, holding rounded uniform
# draws (Experiment in 1..2000, Parameter in 1..9).
DT <- as.data.table(cbind(Experiment = round(runif(1e7,min = 1, max = 2000)),Parameter = round(runif(1e7,min = 1,max = 9))))
# Sequence numbers the rows 1..N within each Experiment, so each
# (Sequence, Parameter, Experiment) group used below holds exactly one row.
DT[,Sequence:= seq_len(.N),keyby = Experiment]
# Time: one value per row, sampled with replacement from 1..60.
DT[,Time := sample(1:60,.N,replace = TRUE)]
# Shuffle the rows once and keep the shuffled table as the common input.
DT.backup = DT[sample(nrow(DT))] # ensure random order
# Every approach starts from a fresh copy of the shuffled table, because
# setkey() in the join approach reorders DT by reference.
DT = copy(DT.backup)
# Approach 1: head(.SD, 1L) evaluated per group.
system.time(
r.head <- DT[, head(.SD,1L), by = .(Sequence, Parameter, Experiment)]
)
# user system elapsed
# 85.848 0.064 85.829
DT = copy(DT.backup)
# Approach 2: .SD[1L] evaluated per group.
system.time(
r.sd1 <- DT[, .SD[1L], by = .(Sequence, Parameter, Experiment)]
)
# user system elapsed
# 7.164 0.044 7.201
DT = copy(DT.backup)
# Approach 3: key the table on the grouping columns, then join it against its
# distinct (Experiment, Sequence, Parameter) combinations, keeping the first
# match for each. The timing includes the setkey() call.
# NOTE(review): the lookup table's column order (Experiment, Sequence,
# Parameter) differs from the key order (Sequence, Parameter, Experiment);
# confirm how the keyed join pairs these columns.
system.time({
setkey(DT,Sequence,Parameter,Experiment)
r.join <- DT[unique(DT[,.(Experiment,Sequence,Parameter)]),mult = "first"]
})
# user system elapsed
# 3.440 0.080 3.516
# The two grouped approaches produced identical results.
all.equal(r.head, r.sd1)
#[1] TRUE
# Compare the grouped result with the join result on their four shared
# columns after sorting both by all four; the recorded output reports a
# mismatch.
all.equal(r.head[order(Sequence,Parameter,Experiment,Time), .(Sequence,Parameter,Experiment,Time)],
r.join[order(Sequence,Parameter,Experiment,Time), .(Sequence,Parameter,Experiment,Time)])
#[1] "Attributes: < Length mismatch: comparison on first 1 components >"
# 1e6 - data.table 1.9.7 ----
# Same three "first row per group" approaches on a 1e6-row table, this time
# with the development version of data.table installed from GitHub.
# The lines starting with "#" directly after a call are the output that was
# recorded when the call was run; they are not regenerated by this script.
# devtools::install_github("Rdatatable/data.table")
packageVersion("data.table")
#[1] ‘1.9.7’
library(data.table)
# Test data: two numeric columns of 1,000,000 rows each, holding rounded
# uniform draws (Experiment in 1..2000, Parameter in 1..9).
DT <- as.data.table(cbind(Experiment = round(runif(1000000,min = 1, max = 2000)),Parameter = round(runif(1000000,min = 1,max = 9))))
# Sequence numbers the rows 1..N within each Experiment, so each
# (Sequence, Parameter, Experiment) group used below holds exactly one row.
DT[,Sequence:= seq_len(.N),keyby = Experiment]
# Time: one value per row, sampled with replacement from 1..60.
DT[,Time := sample(1:60,.N,replace = TRUE)]
# Shuffle the rows once and keep the shuffled table as the common input.
DT.backup = DT[sample(nrow(DT))] # ensure random order
# Every approach starts from a fresh copy of the shuffled table, because
# setkey() in the join approach reorders DT by reference.
DT = copy(DT.backup)
# Approach 1: head(.SD, 1L) evaluated per group.
system.time(
r.head <- DT[, head(.SD,1L), by = .(Sequence, Parameter, Experiment)]
)
# user system elapsed
# 0.236 0.008 0.242
DT = copy(DT.backup)
# Approach 2: .SD[1L] evaluated per group.
system.time(
r.sd1 <- DT[, .SD[1L], by = .(Sequence, Parameter, Experiment)]
)
# user system elapsed
# 0.216 0.004 0.220
DT = copy(DT.backup)
# Approach 3: key the table on the grouping columns, then join it against its
# distinct (Experiment, Sequence, Parameter) combinations, keeping the first
# match for each. The timing includes the setkey() call.
# NOTE(review): the lookup table's column order (Experiment, Sequence,
# Parameter) differs from the key order (Sequence, Parameter, Experiment);
# confirm how the keyed join pairs these columns.
system.time({
setkey(DT,Sequence,Parameter,Experiment)
r.join <- DT[unique(DT[,.(Experiment,Sequence,Parameter)]),mult = "first"]
})
# user system elapsed
# 0.324 0.000 0.324
# The two grouped approaches produced identical results.
all.equal(r.head, r.sd1)
#[1] TRUE
# Compare the grouped result with the join result on their four shared
# columns after sorting both by all four; the recorded output reports a
# mismatch.
all.equal(r.head[order(Sequence,Parameter,Experiment,Time), .(Sequence,Parameter,Experiment,Time)],
r.join[order(Sequence,Parameter,Experiment,Time), .(Sequence,Parameter,Experiment,Time)])
#[1] "Attributes: < Length mismatch: comparison on first 1 components >"
# 1e7 - data.table 1.9.7 ----
# Same three "first row per group" approaches, on a 1e7-row table.
# The lines starting with "#" directly after a call are the output that was
# recorded when the call was run; they are not regenerated by this script.
# Test data: two numeric columns of 1e7 rows each, holding rounded uniform
# draws (Experiment in 1..2000, Parameter in 1..9).
DT <- as.data.table(cbind(Experiment = round(runif(1e7,min = 1, max = 2000)),Parameter = round(runif(1e7,min = 1,max = 9))))
# Sequence numbers the rows 1..N within each Experiment, so each
# (Sequence, Parameter, Experiment) group used below holds exactly one row.
DT[,Sequence:= seq_len(.N),keyby = Experiment]
# Time: one value per row, sampled with replacement from 1..60.
DT[,Time := sample(1:60,.N,replace = TRUE)]
# Shuffle the rows once and keep the shuffled table as the common input.
DT.backup = DT[sample(nrow(DT))] # ensure random order
# Every approach starts from a fresh copy of the shuffled table, because
# setkey() in the join approach reorders DT by reference.
DT = copy(DT.backup)
# Approach 1: head(.SD, 1L) evaluated per group.
system.time(
r.head <- DT[, head(.SD,1L), by = .(Sequence, Parameter, Experiment)]
)
# user system elapsed
# 2.676 0.056 2.732
DT = copy(DT.backup)
# Approach 2: .SD[1L] evaluated per group.
system.time(
r.sd1 <- DT[, .SD[1L], by = .(Sequence, Parameter, Experiment)]
)
# user system elapsed
# 2.620 0.112 2.728
DT = copy(DT.backup)
# Approach 3: key the table on the grouping columns, then join it against its
# distinct (Experiment, Sequence, Parameter) combinations, keeping the first
# match for each. The timing includes the setkey() call.
# NOTE(review): the lookup table's column order (Experiment, Sequence,
# Parameter) differs from the key order (Sequence, Parameter, Experiment);
# confirm how the keyed join pairs these columns.
system.time({
setkey(DT,Sequence,Parameter,Experiment)
r.join <- DT[unique(DT[,.(Experiment,Sequence,Parameter)]),mult = "first"]
})
# user system elapsed
# 3.636 0.084 3.714
# The two grouped approaches produced identical results.
all.equal(r.head, r.sd1)
#[1] TRUE
# Compare the grouped result with the join result on their four shared
# columns after sorting both by all four; the recorded output reports a
# mismatch.
all.equal(r.head[order(Sequence,Parameter,Experiment,Time), .(Sequence,Parameter,Experiment,Time)],
r.join[order(Sequence,Parameter,Experiment,Time), .(Sequence,Parameter,Experiment,Time)])
#[1] "Attributes: < Length mismatch: comparison on first 1 components >"