我已经多次回到这个问题,并且一直在寻找最有效的方法。
我以前使用过 map-reduce 方法,但注意到它不能很好地扩展到具有宽日期间隔的大型数据帧。我刚刚尝试使用 lubridate 包中的 interval 类,发现它是迄今为止最快的实现。
这是最终代码:
library(tidyverse)
library(lubridate)
# Initialize a dataframe with start and end "active" dates per object
N = 1000
id_dates = tibble(id = 1 : N) %>%
mutate(
start = sample(seq(as.Date('2018-1-1'), as.Date('2019-1-1'), by = "day"), size = N, replace = TRUE),
end = start + sample(7 : 100, size = N, replace = TRUE),
interval = interval(start, end))
# Use the %within% command to calculate the number of active items per date
queue_history = tibble(Date = seq(min(id_dates$start), max(id_dates$end), by = "1 day")) %>%
rowwise() %>%
mutate(numInWIP = sum(Date %within% id_dates$interval)) %>%
ungroup()
这里有一些基准表明 lubridate 解决方案比当前答案和 map-reduce 方法快得多
library(tidyverse)
library(lubridate)
# Initialize a dataframe with start and end "active" dates per object
N = 1000
id_dates = tibble(id = 1 : N) %>%
mutate(
start = sample(seq(as.Date('2018-1-1'), as.Date('2019-1-1'), by = "day"), size = N, replace = TRUE),
end = start + sample(7 : 100, size = N, replace = TRUE),
interval = interval(start, end))
# a map-reduce solution
method_mapreduce = function() {
queue_history = as.tibble(table(reduce(map2(id_dates$start, id_dates$end, seq, by = 1), c)))
queue_history = queue_history %>%
rename(Date = Var1, numInWIP = Freq) %>%
mutate(Date = as_date(Date))
return (queue_history)
}
# a lubridate interval solution
method_intervals = function() {
date_df = tibble(Date = seq(min(id_dates$start), max(id_dates$end), by = "1 day"))
queue_history = date_df %>%
rowwise() %>%
mutate(numInWIP = sum(Date %within% id_dates$interval))
return (queue_history)
}
# current best answer
method_currentsolution = function() {
date_df = tibble(Date = seq(min(id_dates$start), max(id_dates$end), by = "1 day"))
queue_history = merge(id_dates, date_df, all=TRUE) %>%
filter(Date >= start, Date <= end) %>%
group_by(Date) %>%
summarise(n())
}
# Compare with benchmarks
tst = microbenchmark::microbenchmark(
method_mapreduce(),
method_intervals(),
method_currentsolution(),
times = 5)
microbenchmark::autoplot.microbenchmark(tst) +
scale_y_log10(
name = sprintf("Time [%s]", attr(summary(tst), "unit")),
breaks = scales::trans_breaks("log10", function(x) round(10^x)))