【发布时间】:2021-03-15 05:28:48
【问题描述】:
我有一个数据集,其中包含每个 ID 的一系列日期。我已经生成了一系列超前和滞后变量,现在我想生成另一组变量,其中包含每行中超前和滞后变量之间的天数差异。当我生成超前和滞后变量时,我使用 paste0 为每个变量名称附加一个数字。例如,滞后变量被命名为 prev_date1:prev_date20。我希望能够使用这些数字生成另一组变量来计算对之间的天数差异。一般形式由下式给出:
diff2prev[i] = prev_date[i-1] - prev_date[i]
但我不知道如何在实践中实现这一点。在我最初的方法中,我只有 7 个变量并将它们分别写出来(包括此示例代码),但现在我需要生成 7 个以上的变量,因此我想找到一种更有效的方法来做到这一点。如示例中所示,我已尝试使用 data.table 和 dplyr,但到目前为止都没有。任何关于我哪里出错以及如何改进我的代码的指针将不胜感激。
if (!require('pacman')) install.packages('pacman'); library(pacman)
#> Loading required package: pacman
p_load("dplyr", "lubridate","tidyverse")
id <- c(13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15)
date <- c("2017-06-06", "2017-07-26", "2017-09-22", "2017-10-21", "2018-03-29", "2018-03-29", "2018-03-29", "2018-03-29", "2018-03-29", "2018-03-29", "2018-03-29", "2018-03-29", "2018-03-29", "2018-03-29", "2018-03-29", "2019-05-12", "2019-06-07", "2019-10-08","2016-10-20", "2016-10-20", "2016-10-20", "2016-10-20", "2018-01-06", "2018-01-06", "2018-01-06", "2018-01-06", "2018-01-06","2018-01-06", "2018-05-02", "2018-08-04", "2018-08-04", "2018-08-04", "2018-11-22", "2018-12-26", "2018-12-26", "2018-12-26", "2018-12-26", "2018-12-26", "2018-12-26", "2018-12-26", "2018-12-26", "2018-12-26", "2018-12-26", "2018-12-26", "2019-05-11","2019-06-04", "2019-11-18", "2016-04-01", "2018-04-04", "2019-04-03", "2019-04-04", "2019-04-04", "2019-04-04", "2019-04-04","2019-04-04", "2019-04-04", "2019-04-04", "2019-04-04", "2019-04-04", "2019-04-04", "2019-04-04", "2019-06-03", "2019-06-04", "2019-11-23")
sample <- bind_cols(id, date)
#> New names:
#> * NA -> ...1
#> * NA -> ...2
colnames <- c("id", "date")
names(sample) <- colnames
sample <- sample %>%
group_by(id) %>%
mutate(date = as_date(date))
#Using data.table shift/lag to create 20 prev dates
sample[,paste0('prev_date', 1:20) := shift(date, 1:20, type="lag"), by = id][]
#Using data.table shift/lead to create 20 prev dates
sample[,paste0('next_date', 1:20) := shift(date, 1:20, type="lead"), by = id][]
这是我目前尝试过的
## Dplyr approach to writing out each new variable
##This works but seems inefficient
sample <- sample %>%
group_by(id) %>%
mutate(diff2prev = date - prev_date,
diff2prev1 = prev_date - prev_date1,
diff2prev2 = prev_date1 - prev_date2,
diff2prev3 = prev_date2 - prev_date3,
diff2prev4 = prev_date3 - prev_date4,
diff2prev5 = prev_date4 - prev_date5,
diff2prev6 = prev_date5 - prev_date6,
diff2prev7 = prev_date6 - prev_date7,
diff2next = next_date - date,
diff2next1 = next_date1 - next_date,
diff2next2 = next_date2 - next_date1,
diff2next3 = next_date3 - next_date2,
diff2next4 = next_date4 - next_date3,
diff2next5 = next_date5 - next_date4,
diff2next6 = next_date6 - next_date5,
diff2next7 = next_date7 - next_date6)
##Attempt at using data.table to generate variables but not sure how to incorporate the length of [i] for iteration
setDT(pid_ell)[,paste0('diff2prev', 1:20) := (diff2prev[i] = prev_date[i-1] - prev_date[i], 1:20), by = id][]
##Attempt to create a function that would create the new empty variables and then fill them in
#function to create variable calculating the difference in days to the previous date
fn_diff2prev <- function(date, prev_date) {
for (i in 2:lead_lag){
diff2prev[i] <- paste0('diff2prev', 1:20) # new var names
}
diff2prev1 <- date - prev_date1 #first one calculates from date
for (i in 2:lead_lag){
diff2prev[i] <- prev_date[i-1] - prev_date[i] #others calculate based on [i]
}
return
}
【问题讨论】:
标签: r date data.table dplyr