【问题标题】:adding several lags/shifts to list of columns在列列表中添加几个滞后/移位
【发布时间】:2020-05-04 15:06:48
【问题描述】:

我想滞后几列(例如 value_1 + value_2 + x - 见下文),定义它们的滞后数(例如 3)及其命名。这是一些工作繁琐/手动的代码:

library(dplyr)
library(lubridate)
library(data.table)

haves <- data.frame(
      id = c("a", "a", "a", "a", "a", "a", "b", "b", "b", "b", "b", "b")
    , date = as.Date(c("2015-01-01", "2015-02-01", "2015-03-01", "2015-04-01", "2015-05-01", "2015-06-01", "2015-01-01", "2015-02-01", "2015-03-01", "2015-04-01", "2015-05-01", "2015-06-01"))
    , value_1 = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)
    , value_2 = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)
    , x = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)
) 
haves$value_2 <- haves$value_2 + 1
haves$x <- haves$x + 2

haves

wants <- haves %>%
    group_by(id) %>% 
    mutate(
        value_1_lag_1 = lag(value_1, n = 1, order_by = date)
        , value_1_lag_2 = lag(value_1, n = 2, order_by = date)
        , value_1_lag_3 = lag(value_1, n = 3, order_by = date)

        , value_2_lag_1 = lag(value_2, n = 1, order_by = date)
        , value_2_lag_2 = lag(value_2, n = 2, order_by = date)
        , value_2_lag_3 = lag(value_2, n = 3, order_by = date)

        , x_lag_1 = lag(x, n = 1, order_by = date)
        , x_lag_2 = lag(x, n = 2, order_by = date)
        , x_lag_3 = lag(x, n = 3, order_by = date)
    )

wants

有人建议this提供解决方案,所以我尝试了数据表的方法,但没有成功:

setDT(haves)
haves[, sapply(1:3, function(x){paste0('', x, '_lag_', 1:3)}) := shift(.SD, 1:3), 
   by = id, .SDcols = value_1:x][]

它不会满足我的需求。这更接近了:

colnames <- colnames(haves)

setDT(haves)
haves[, sapply(1:3, function(x){paste0(colnames[x + 2], x, '_lag_', 1:3)}) := shift(.SD, 1:3), by = id, .SDcols = value_1:x][]

或者,我可以只使用循环和这样的函数:

appender <- function(df, column, lag){

    df %>%
        group_by(
            id
        ) %>%
        mutate(
            !!paste0(column, "_lag_", lag) := lag(!!rlang::sym(column), n = lag, order_by = date) 
        )
}

temp <- appender(haves, "value_2", 3)

任何帮助将不胜感激。谢谢!

【问题讨论】:

    标签: r dplyr


    【解决方案1】:

    这是通过 data.table 的另一种方式。

    library(data.table)
    library(lubridate)
    #> 
    #> Attaching package: 'lubridate'
    #> The following objects are masked from 'package:data.table':
    #> 
    #>     hour, isoweek, mday, minute, month, quarter, second, wday, week,
    #>     yday, year
    #> The following object is masked from 'package:base':
    #> 
    #>     date
    library(stringr)
    
    haves <- data.frame(
      id = c("a", "a", "a", "a", "a", "a", "b", "b", "b", "b", "b", "b")
      , date = as.Date(c("2015-01-01", "2015-02-01", "2015-03-01", "2015-04-01", "2015-05-01", "2015-06-01", "2015-01-01", "2015-02-01", "2015-03-01", "2015-04-01", "2015-05-01", "2015-06-01"))
      , value_1 = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)
      , value_2 = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)
      , value_3 = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)
    ) 
    
    setDT(haves)
    setorder(haves, date)
    
    N_vars = 3
    N_lags = 3
    current_vars = paste0( "value_", rep(1:N_vars, each = 1) )
    new_vars = paste0( "value_", rep(1:N_vars, each = N_lags), "_lag_", 1:N_lags )
    
    to_define = new_vars[ str_detect(new_vars, "lag_1") ]
    haves[ , (new_vars) := shift( .SD, 1:N_lags ), .SDcols = current_vars]
    haves
    #>     id       date value_1 value_2 value_3 value_1_lag_1 value_1_lag_2
    #>  1:  a 2015-01-01       1       1       1            NA            NA
    #>  2:  b 2015-01-01       7       7       7             1            NA
    #>  3:  a 2015-02-01       2       2       2             7             1
    #>  4:  b 2015-02-01       8       8       8             2             7
    #>  5:  a 2015-03-01       3       3       3             8             2
    #>  6:  b 2015-03-01       9       9       9             3             8
    #>  7:  a 2015-04-01       4       4       4             9             3
    #>  8:  b 2015-04-01      10      10      10             4             9
    #>  9:  a 2015-05-01       5       5       5            10             4
    #> 10:  b 2015-05-01      11      11      11             5            10
    #> 11:  a 2015-06-01       6       6       6            11             5
    #> 12:  b 2015-06-01      12      12      12             6            11
    #>     value_1_lag_3 value_2_lag_1 value_2_lag_2 value_2_lag_3 value_3_lag_1
    #>  1:            NA            NA            NA            NA            NA
    #>  2:            NA             1            NA            NA             1
    #>  3:            NA             7             1            NA             7
    #>  4:             1             2             7             1             2
    #>  5:             7             8             2             7             8
    #>  6:             2             3             8             2             3
    #>  7:             8             9             3             8             9
    #>  8:             3             4             9             3             4
    #>  9:             9            10             4             9            10
    #> 10:             4             5            10             4             5
    #> 11:            10            11             5            10            11
    #> 12:             5             6            11             5             6
    #>     value_3_lag_2 value_3_lag_3
    #>  1:            NA            NA
    #>  2:            NA            NA
    #>  3:             1            NA
    #>  4:             7             1
    #>  5:             2             7
    #>  6:             8             2
    #>  7:             3             8
    #>  8:             9             3
    #>  9:             4             9
    #> 10:            10             4
    #> 11:             5            10
    #> 12:            11             5
    

    reprex package (v0.3.0) 于 2020 年 5 月 4 日创建

    【讨论】:

    • 谢谢。请注意,我引入了另一个变量 x 来强制解决方案更通用。
    【解决方案2】:

    这是来自链接答案的改编 dplyr 解决方案。

    haves %>%
      group_by(id) %>%
      nest %>%
      mutate(data = map(data, ~arrange(., date))) %>%
      mutate(lags = map(data, function(dat) {
        imap_dfc(dat[-1], ~set_names(map(1:3, lag, x = .x),
                                     paste0(.y, "_lag_", 1:3)))
      })) %>%
      unnest(c(data, lags))
    

    这就是你要找的吗?

    【讨论】:

      猜你喜欢
      • 1970-01-01
      • 2023-02-08
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 2021-02-16
      • 1970-01-01
      • 1970-01-01
      相关资源
      最近更新 更多