我在自己的数据上测试了@flodel 和@Sven Hohenstein 提出的不同解决方案(请注意,@Martin Morgan 的方法暂时无法测试,因为它不支持 a 的前缀为a 的其他元素)。
重要提示:尽管所有方法在我的具体情况下给出相同的结果,但提醒他们都有自己的方式,因此可以根据数据的结构给出不同的结果
这里是一个快速的总结(结果如下所示):
- 在我的测试中,
length(a) 和 length(b) 分别设置为 200 或 400 和 2,000 或 10,000
-
b 中a 的每个值只有一个匹配项
- 最好的方法确实取决于问题,并且都值得针对每个具体案例进行测试
-
pmatch 总是表现得非常好(特别是对于小长度的向量 a 和 b,分别小于 100 和 1,000 - 未在下面显示),
-
sapply(a, grep, b, fixed=T) 和 reduced.match(flodel 方法)函数的性能始终优于 sapply(a, grep, b)) 和 sapply(paste0("^", a), grep, b)。
这是可重现的代码以及测试结果
# set up the data set
library(microbenchmark)
categ <- c("Control", "Gr", "Or", "PMT", "P450")
genes <- paste(categ, rep(1:40, each=length(categ)), sep="_")
a0 <- paste(genes, "_", rep(1:50, each=length(genes)), "_", sep="")
b0 <- paste (a0, "1", sep="")
# length(a)==200 & length(b)==2,000
ite <- 200
lg <- 2000
b <- b0[1:lg]
a <- (a0[1:lg])[sample(seq(lg), ite)]
microbenchmark(as.vector(sapply(a, grep, b)), # original
as.vector(sapply(paste0("^", a), grep, b)), # @flodel 1
as.vector(sapply(a, grep, b, fixed = TRUE)), # Sven Hohenstein
unlist(reduced.match(a, b)), # @ flodel 2
#~ f3(a, b), @Martin Morgan
pmatch(a, b))
Unit: milliseconds
expr min lq median
as.vector(sapply(a, grep, b)) 188.810585 189.256705 189.827765
as.vector(sapply(paste0("^", a), grep, b)) 157.600510 158.113507 158.560619
as.vector(sapply(a, grep, b, fixed = TRUE)) 23.954520 24.109275 24.269991
unlist(reduced.match(a, b)) 7.999203 8.087931 8.140260
pmatch(a, b) 7.459394 7.489923 7.586329
uq max neval
191.412879 222.131220 100
160.129008 186.695822 100
25.923741 26.380578 100
8.237207 10.063783 100
7.637560 7.888938 100
# length(a)==400 & length(b)==2,000
ite <- 400
lg <- 2000
b <- b0[1:lg]
a <- (a0[1:lg])[sample(seq(lg), ite)]
microbenchmark(as.vector(sapply(a, grep, b)), # original
as.vector(sapply(paste0("^", a), grep, b)), # @flodel 1
as.vector(sapply(a, grep, b, fixed = TRUE)), # Sven Hohenstein
unlist(reduced.match(a, b)), # @ flodel 2
#~ f3(a, b), @Martin Morgan
pmatch(a, b))
Unit: milliseconds
expr min lq median
as.vector(sapply(a, grep, b)) 376.85638 379.58441 380.46107
as.vector(sapply(paste0("^", a), grep, b)) 314.38333 316.79849 318.33426
as.vector(sapply(a, grep, b, fixed = TRUE)) 49.56848 51.54113 51.90420
unlist(reduced.match(a, b)) 13.31185 13.44923 13.57679
pmatch(a, b) 15.15788 15.24773 15.36917
uq max neval
383.26959 415.23281 100
320.92588 346.66234 100
52.02379 81.65053 100
15.56503 16.83750 100
15.45680 17.58592 100
# length(a)==200 & length(b)==10,000
ite <- 200
lg <- 10000
b <- b0[1:lg]
a <- (a0[1:lg])[sample(seq(lg), ite)]
microbenchmark(as.vector(sapply(a, grep, b)), # original
as.vector(sapply(paste0("^", a), grep, b)), # @flodel 1
as.vector(sapply(a, grep, b, fixed = TRUE)), # Sven Hohenstein
unlist(reduced.match(a, b)), # @ flodel 2
#~ f3(a, b), @Martin Morgan
pmatch(a, b))
Unit: milliseconds
expr min lq median
as.vector(sapply(a, grep, b)) 975.34831 978.55579 981.56864
as.vector(sapply(paste0("^", a), grep, b)) 808.79299 811.64919 814.16552
as.vector(sapply(a, grep, b, fixed = TRUE)) 119.64240 120.41718 120.73548
unlist(reduced.match(a, b)) 34.23893 34.56048 36.23506
pmatch(a, b) 37.57552 37.82128 38.01727
uq max neval
986.17827 1061.89808 100
824.41931 854.26298 100
121.20605 151.43524 100
36.57896 43.33285 100
38.21910 40.87238 100
# length(a)==400 & length(b)==10500
ite <- 400
lg <- 10000
b <- b0[1:lg]
a <- (a0[1:lg])[sample(seq(lg), ite)]
microbenchmark(as.vector(sapply(a, grep, b)), # original
as.vector(sapply(paste0("^", a), grep, b)), # @flodel 1
as.vector(sapply(a, grep, b, fixed = TRUE)), # Sven Hohenstein
unlist(reduced.match(a, b)), # @ flodel 2
#~ f3(a, b), @Martin Morgan
pmatch(a, b))
Unit: milliseconds
expr min lq median
as.vector(sapply(a, grep, b)) 1977.69564 2003.73443 2028.72239
as.vector(sapply(paste0("^", a), grep, b)) 1637.46903 1659.96661 1677.21706
as.vector(sapply(a, grep, b, fixed = TRUE)) 236.81745 238.62842 239.67875
unlist(reduced.match(a, b)) 57.18344 59.09308 59.48678
pmatch(a, b) 75.03812 75.40420 75.60641
uq max neval
2076.45628 2223.94624 100
1708.86306 1905.16534 100
241.12830 283.23043 100
59.76167 88.71846 100
75.99034 90.62689 100