你也可以使用count:
library(dplyr)
# Tally the rows of df for each distinct value of x; the tally column is n.
count(df, x)
这将调用n() 来计算每个x 的观察次数:
# Source: local data frame [3 x 2]
#
# x n
# 1 . 1
# 2 ing 2
# 3 str 3
如果您只想保留出现至少 3 次的值,请使用 filter():
# Tally x, then keep only the values that occur at least three times.
df %>%
  count(x) %>%
  filter(n >= 3)
这给出了:
# Source: local data frame [1 x 2]
#
# x n
# 1 str 3
最后,如果您只想提取满足过滤条件的因子值:
# Run the tally-and-filter pipeline, then extract the x column from the result.
(df %>% count(x) %>% filter(n >= 3))$x
# [1] str
# Levels: . ing str
根据 @David 在评论中的建议,您也可以使用 data.table:
library(data.table)
# NOTE: setDT() converts df to a data.table by reference (df itself changes).
# Per group of x, emit x only when the group has at least three rows; the
# unnamed result column is called V1.
setDT(df)[, if (.N >= 3L) x, by = x][["V1"]]
或者
# Count rows per x into N, then select x for the groups where N is at least 3.
setDT(df)[, .N, by = x][N >= 3, x]
# [1] str
# Levels: . ing str
根据 @Frank 的建议,您也可以使用 table 背后的“主力”函数 tabulate:
# Levels of the first column that occur at least three times.
# nbins is pinned to the number of levels: by default tabulate() stops at the
# largest code actually present, so trailing unused levels would make the
# count vector shorter than levels() and the logical index would be recycled.
levels(df[[1]])[tabulate(df[[1]], nbins = nlevels(df[[1]])) >= 3]
# [1] "str"
基准测试
# Benchmark data: 10 million letters drawn with replacement from LETTERS.
# stringsAsFactors = TRUE is explicit because the base2 variant needs a factor
# column for levels()/tabulate(); since R 4.0.0 data.frame() keeps strings as
# character by default.
df <- data.frame(
  x = sample(LETTERS, 10e6, replace = TRUE),
  stringsAsFactors = TRUE
)
# Separate copy for the data.table variants, since setDT() converts its
# argument to a data.table by reference.
df2 <- copy(df)
library(microbenchmark)
# Every variant answers the same question: which values of x occur at least
# 385000 times?
mbm <- microbenchmark(
  base = names(which(table(df$x) >= 385000)),
  # nbins is pinned to the number of levels so the counts always line up with
  # levels(); the comparison is >= to match the other variants.
  base2 = levels(df[[1]])[
    tabulate(df[[1]], nbins = nlevels(df[[1]])) >= 385000L
  ],
  dplyr = count(df, x) %>% filter(n >= 385000) %>% .$x,
  DT1 = setDT(df2)[, if (.N >= 385000) x, by = x]$V1,
  DT2 = setDT(df2)[, .N, by = x][, x[N >= 385000]],
  times = 50
)
> mbm
#Unit: milliseconds
# expr min lq mean median uq max neval cld
# base 495.44936 523.29186 545.08199 543.56660 551.90360 652.13492 50 d
# base2 20.08123 20.09819 20.11988 20.10633 20.14137 20.20876 50 a
# dplyr 226.75800 227.27992 231.19709 228.36296 232.71308 259.20770 50 c
# DT1 41.03576 41.28474 50.92456 48.40740 48.66626 168.53733 50 b
# DT2 41.45874 41.85510 50.76797 48.93944 49.49339 74.58234 50 b