下面的函数接收数据集中每个组的全部行号(row_number),先进行无放回抽样(sample),再组合使用 split 和 findInterval,删除所有落在步长范围内的值。返回的向量传给 slice,即可按所需的样本步长输出所需大小的样本。
可根据需要修改 sample_size 和 sample_step,以调整初始样本大小以及保留样本之间的最小行数间隔。
library(plyr)
#' Draw a sorted random sample and thin it to a minimum spacing
#'
#' Draws `sample_size` values from `x` without replacement, sorts them, and
#' then keeps only the smallest value of each consecutive window of width
#' `sample_step`. Every kept value is therefore at least `sample_step`
#' larger than the previously kept one, so the result can be shorter than
#' `sample_size`.
#'
#' @param x Passed straight to `sample()`: either a vector to draw from or a
#'   single positive number `n`, in which case values are drawn from `1:n`.
#'   The drawn values are expected to be numeric.
#' @param sample_size Number of values to draw before thinning.
#' @param sample_step Minimum distance between two kept values. Must be a
#'   single positive number.
#' @return An unnamed vector with the kept values in increasing order
#'   (zero-length when `sample_size` is 0).
sample_drop <- function(x, sample_size, sample_step = 1) {
  # A zero step would never shrink the remaining values (endless loop), and a
  # negative or missing step breaks findInterval(), so reject these up front.
  if (!is.numeric(sample_step) || length(sample_step) != 1L ||
      is.na(sample_step) || sample_step <= 0) {
    stop("`sample_step` must be a single positive number", call. = FALSE)
  }

  # draw sample and sort it so windows can be peeled off from the front
  remaining <- sort(sample(x, size = sample_size, replace = FALSE))
  if (length(remaining) == 0L) {
    return(remaining)
  }

  # At most one value is kept per drawn value, so preallocate for that.
  kept <- vector("list", length(remaining))
  n_kept <- 0L

  while (length(remaining) > 0L) {
    window_start <- remaining[[1L]]
    # Bin 1: values inside [window_start, window_start + sample_step).
    # Bin 2: values at or beyond the end of that window.
    bins <- findInterval(remaining, c(0, sample_step) + window_start)
    # Fixed factor levels guarantee both bins exist, even when one is empty.
    parts <- split(remaining, factor(bins, levels = 1:2))

    # Keep the window's minimum; the rest of the window is dropped.
    n_kept <- n_kept + 1L
    kept[[n_kept]] <- window_start
    remaining <- parts[[2L]]
  }

  unlist(kept[seq_len(n_kept)], use.names = FALSE)
}
下面将该函数应用于 iris 数据集。
library(dplyr)
# Load the example data and collect the sampling settings in a single list.
data("iris")
sample <- list(seed = 1, size = 15L, step = 20L)

# Simulate three stand-alone draws with dropping so they can be compared to
# the iris results below (50 presumably matches the size of each iris
# group; verify against the data).
set.seed(sample$seed)
sample_drop(50, sample_size = sample$size, sample_step = sample$step)
sample_drop(50, sample_size = sample$size, sample_step = sample$step)
sample_drop(50, sample_size = sample$size, sample_step = sample$step)

# Re-seed so the grouped pipeline starts from the same random state, then
# keep only the thinned sample rows of every Species group. `gid` records
# each row's position within its group before slicing.
set.seed(sample$seed)
iris %>%
  group_by(Species) %>%
  mutate(gid = row_number()) %>%
  slice(
    sample_drop(n(), sample_size = sample$size, sample_step = sample$step)
  )
下面将该函数应用于更大的 diamonds 数据集。
library(dplyr)
library(ggplot2)
# Load the larger example data and collect the sampling settings in one list.
data("diamonds")
sample <- list(seed = 1, size = 1000L, step = 20L)

# Thinned sample rows for every `cut` group; `gid` records each row's
# position within its group before slicing.
set.seed(sample$seed)
diamonds %>%
  group_by(cut) %>%
  mutate(gid = row_number()) %>%
  slice(
    sample_drop(n(), sample_size = sample$size, sample_step = sample$step)
  )

# Same draw again (re-seeded), reduced to the number of rows kept per group.
set.seed(sample$seed)
diamonds %>%
  group_by(cut) %>%
  mutate(gid = row_number()) %>%
  slice(
    sample_drop(n(), sample_size = sample$size, sample_step = sample$step)
  ) %>%
  summarise(samples = n())
可能还有改进的余地,但对我来说这种写法更容易理解。