【问题标题】:Creating Episodes for Groups based on Date Sequence根据日期顺序为组创建剧集
【发布时间】:2020-02-28 20:59:36
【问题描述】:

我正在尝试按变量 (ID) 对数据进行分组,然后根据日期创建剧集。这篇文章帮助我创建了我正在寻找的输出,但我不知道如何为分组变量 (ID) 创建剧集。 Breaking down a timed sequence into episodes

上面链接帖子中的建议效果很好,但仅适用于一个 ID。

runs <-rle(df$EpisodeTimeCriterian)$lengths
df$Episode <- rep(1:length(runs),runs)

我最喜欢使用 dplyr 对数据进行分组,但是当我尝试 group_by 然后创建 Episode 变量时,我收到了一个错误。

df %>%
   group_by(ID)%>%
   mutate(Episode = rep(1:length(runs),runs))

Error: Column `Episode` must be length 42 (the group size) or one, not 66

更新:

感谢 Ben 在下面的建议,我能够按个人 ID 对它们进行分组,但现在我意识到我做错了日期之间的时间。如果自上一个日期起已过去 30 天以上,我希望开始新的一集。我以为我是通过计算两者之间的 difftime 来实现的,但它不起作用。

我想要预期的剧集:

# A tibble: 24 x 5
   ID    Date       days_until_next EpisodeTimeCriterian expected
   <chr> <date>               <dbl> <lgl>                   <dbl>
 1 456   2013-10-07               7 TRUE                        1
 2 456   2013-10-14             119 FALSE                       1
 3 456   2014-02-10             220 FALSE                       2
 4 456   2014-09-18               4 TRUE                        3
 5 456   2014-09-22               3 TRUE                        3
 6 456   2014-09-25               7 TRUE                        3
 7 456   2014-10-02               6 TRUE                        3
 8 456   2014-10-08               8 TRUE                        3
 9 456   2014-10-16              97 FALSE                       3
10 456   2015-01-21              15 TRUE                        4
11 456   2015-02-05              21 TRUE                        4
12 456   2015-02-26              41 FALSE                       4
13 456   2015-04-08              57 FALSE                       5
14 456   2015-06-04              12 TRUE                        6
15 456   2015-06-16               2 TRUE                        6
16 456   2015-06-18              49 FALSE                       6
17 456   2015-08-06              14 TRUE                        7
18 456   2015-08-20              42 FALSE                       7
19 456   2015-10-01              12 TRUE                        8
20 456   2015-10-13              16 TRUE                        8
21 456   2015-10-29              12 TRUE                        8
22 456   2015-11-10              65 FALSE                       8
23 456   2016-01-14               1 TRUE                        9
24 456   2016-01-15            -830 TRUE                        9

当前尝试

df <- original %>%
    group_by(ID)%>%  arrange(ID,Date)%>%
    mutate(days_until_next = abs(difftime(Date,lead(Date,1),units="days")))%>%
  mutate(EpisodeTimeCriterian= days_until_next <=30 | is.na(days_until_next))

runs <-rle(df$EpisodeTimeCriterian)$lengths
df$Episode <- rep(1:length(runs),runs)

df %>%
  group_by(ID) %>%
  mutate(
    Episode2 = {
      r <- rle(EpisodeTimeCriterian)
      r$values <- cumsum(rep(1, length(r$values)))
      inverse.rle(r)
    }
  ) %>%
  print(n=66)

数据

df <- structure(list(ID = c("123", "123", "123", "123", "123", "123", 
"123", "123", "123", "123", "123", "123", "123", "123", "123", 
"123", "123", "123", "123", "123", "123", "123", "123", "123", 
"123", "123", "123", "123", "123", "123", "123", "123", "123", 
"123", "123", "123", "123", "123", "123", "123", "123", "123", 
"456", "456", "456", "456", "456", "456", "456", "456", "456", 
"456", "456", "456", "456", "456", "456", "456", "456", "456", 
"456", "456", "456", "456", "456", "456"), Date = structure(c(15986, 
15993, 16000, 16007, 16014, 16021, 16028, 16035, 16042, 16056, 
16066, 16077, 16084, 16091, 16093, 16094, 16098, 16105, 16106, 
16133, 18130, 18137, 18139, 18144, 18151, 18164, 18176, 18190, 
18197, 18204, 18211, 18218, 18225, 18232, 18239, 18246, 18253, 
18254, 18267, 18274, 18281, 18288, 15985, 15992, 16111, 16331, 
16335, 16338, 16345, 16351, 16359, 16456, 16471, 16492, 16533, 
16590, 16602, 16604, 16653, 16667, 16709, 16721, 16737, 16749, 
16814, 16815), class = "Date"), days_until_next = c(7, 7, 7, 
7, 7, 7, 7, 7, 14, 10, 11, 7, 7, 2, 1, 4, 7, 1, 27, 1997, 7, 
2, 5, 7, 13, 12, 14, 7, 7, 7, 7, 7, 7, 7, 7, 7, 1, 13, 7, 7, 
7, -2302, 7, 119, 220, 4, 3, 7, 6, 8, 97, 15, 21, 41, 57, 12, 
2, 49, 14, 42, 12, 16, 12, 65, 1, -830), EpisodeTimeCriterian = c(TRUE, 
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, 
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, TRUE, TRUE, 
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, 
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, 
FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, 
FALSE, TRUE, TRUE, FALSE, TRUE, FALSE, TRUE, TRUE, TRUE, FALSE, 
TRUE, TRUE)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
-66L))

数据(已更新,仅 ID=456)

df %>%
structure(list(ID = c("456", "456", "456", "456", "456", "456", 
"456", "456", "456", "456", "456", "456", "456", "456", "456", 
"456", "456", "456", "456", "456", "456", "456", "456", "456"
), Date = structure(c(15985, 15992, 16111, 16331, 16335, 16338, 
16345, 16351, 16359, 16456, 16471, 16492, 16533, 16590, 16602, 
16604, 16653, 16667, 16709, 16721, 16737, 16749, 16814, 16815
), class = "Date"), days_until_next = c(7, 119, 220, 4, 3, 7, 
6, 8, 97, 15, 21, 41, 57, 12, 2, 49, 14, 42, 12, 16, 12, 65, 
1, -830), EpisodeTimeCriterian = c(TRUE, FALSE, FALSE, TRUE, 
TRUE, TRUE, TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, FALSE, TRUE, 
TRUE, FALSE, TRUE, FALSE, TRUE, TRUE, TRUE, FALSE, TRUE, TRUE
), expected = c(1, 1, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 5, 6, 6, 
6, 7, 7, 8, 8, 8, 8, 9, 9)), row.names = c(NA, -24L), class = c("tbl_df", 
"tbl", "data.frame"))

【问题讨论】:

    标签: r date sequence


    【解决方案1】:

    这可能是一种方法。我在从初始示例创建的Episode 旁边添加了带有group_by 的较新输出作为Episode2。希望这会有所帮助。

    library(tidyverse)
    
    df %>%
      group_by(ID) %>%
      mutate(
        Episode2 = {
          r <- rle(EpisodeTimeCriterian)
          r$values <- cumsum(rep(1, length(r$values)))
          inverse.rle(r)
        }
      ) %>%
      print(n=66)
    

    输出

    # A tibble: 66 x 6
    # Groups:   ID [2]
       ID    Date       days_until_next EpisodeTimeCriterian Episode Episode2
       <chr> <date>               <dbl> <lgl>                  <int>    <dbl>
     1 123   2013-10-08               7 TRUE                       1        1
     2 123   2013-10-15               7 TRUE                       1        1
     3 123   2013-10-22               7 TRUE                       1        1
     4 123   2013-10-29               7 TRUE                       1        1
     5 123   2013-11-05               7 TRUE                       1        1
     6 123   2013-11-12               7 TRUE                       1        1
     7 123   2013-11-19               7 TRUE                       1        1
     8 123   2013-11-26               7 TRUE                       1        1
     9 123   2013-12-03              14 TRUE                       1        1
    10 123   2013-12-17              10 TRUE                       1        1
    11 123   2013-12-27              11 TRUE                       1        1
    12 123   2014-01-07               7 TRUE                       1        1
    13 123   2014-01-14               7 TRUE                       1        1
    14 123   2014-01-21               2 TRUE                       1        1
    15 123   2014-01-23               1 TRUE                       1        1
    16 123   2014-01-24               4 TRUE                       1        1
    17 123   2014-01-28               7 TRUE                       1        1
    18 123   2014-02-04               1 TRUE                       1        1
    19 123   2014-02-05              27 TRUE                       1        1
    20 123   2014-03-04            1997 FALSE                      2        2
    21 123   2019-08-22               7 TRUE                       3        3
    22 123   2019-08-29               2 TRUE                       3        3
    23 123   2019-08-31               5 TRUE                       3        3
    24 123   2019-09-05               7 TRUE                       3        3
    25 123   2019-09-12              13 TRUE                       3        3
    26 123   2019-09-25              12 TRUE                       3        3
    27 123   2019-10-07              14 TRUE                       3        3
    28 123   2019-10-21               7 TRUE                       3        3
    29 123   2019-10-28               7 TRUE                       3        3
    30 123   2019-11-04               7 TRUE                       3        3
    31 123   2019-11-11               7 TRUE                       3        3
    32 123   2019-11-18               7 TRUE                       3        3
    33 123   2019-11-25               7 TRUE                       3        3
    34 123   2019-12-02               7 TRUE                       3        3
    35 123   2019-12-09               7 TRUE                       3        3
    36 123   2019-12-16               7 TRUE                       3        3
    37 123   2019-12-23               1 TRUE                       3        3
    38 123   2019-12-24              13 TRUE                       3        3
    39 123   2020-01-06               7 TRUE                       3        3
    40 123   2020-01-13               7 TRUE                       3        3
    41 123   2020-01-20               7 TRUE                       3        3
    42 123   2020-01-27           -2302 TRUE                       3        3
    43 456   2013-10-07               7 TRUE                       3        1
    44 456   2013-10-14             119 FALSE                      4        2
    45 456   2014-02-10             220 FALSE                      4        2
    46 456   2014-09-18               4 TRUE                       5        3
    47 456   2014-09-22               3 TRUE                       5        3
    48 456   2014-09-25               7 TRUE                       5        3
    49 456   2014-10-02               6 TRUE                       5        3
    50 456   2014-10-08               8 TRUE                       5        3
    51 456   2014-10-16              97 FALSE                      6        4
    52 456   2015-01-21              15 TRUE                       7        5
    53 456   2015-02-05              21 TRUE                       7        5
    54 456   2015-02-26              41 FALSE                      8        6
    55 456   2015-04-08              57 FALSE                      8        6
    56 456   2015-06-04              12 TRUE                       9        7
    57 456   2015-06-16               2 TRUE                       9        7
    58 456   2015-06-18              49 FALSE                     10        8
    59 456   2015-08-06              14 TRUE                      11        9
    60 456   2015-08-20              42 FALSE                     12       10
    61 456   2015-10-01              12 TRUE                      13       11
    62 456   2015-10-13              16 TRUE                      13       11
    63 456   2015-10-29              12 TRUE                      13       11
    64 456   2015-11-10              65 FALSE                     14       12
    65 456   2016-01-14               1 TRUE                      15       13
    66 456   2016-01-15            -830 TRUE                      15       13
    

    编辑(2020 年 3 月 2 日)

    我认为如果规则是>= 30 天的日期差开始新一集可能比以前的方法更容易。看看这是否适合你:

    library(tidyverse)
    
    df %>%
      group_by(ID) %>%
      mutate(difftime = Date - lag(Date, default = first(Date)),
             expected2 = cumsum(difftime >= 30) + 1) %>%
      print(n=24)
    

    输出

    # A tibble: 24 x 7
    # Groups:   ID [1]
       ID    Date       days_until_next EpisodeTimeCrit~ expected difftime expected2
       <chr> <date>               <dbl> <lgl>               <dbl> <time>       <dbl>
     1 456   2013-10-07               7 TRUE                    1   0 days         1
     2 456   2013-10-14             119 FALSE                   1   7 days         1
     3 456   2014-02-10             220 FALSE                   2 119 days         2
     4 456   2014-09-18               4 TRUE                    3 220 days         3
     5 456   2014-09-22               3 TRUE                    3   4 days         3
     6 456   2014-09-25               7 TRUE                    3   3 days         3
     7 456   2014-10-02               6 TRUE                    3   7 days         3
     8 456   2014-10-08               8 TRUE                    3   6 days         3
     9 456   2014-10-16              97 FALSE                   3   8 days         3
    10 456   2015-01-21              15 TRUE                    4  97 days         4
    11 456   2015-02-05              21 TRUE                    4  15 days         4
    12 456   2015-02-26              41 FALSE                   4  21 days         4
    13 456   2015-04-08              57 FALSE                   5  41 days         5
    14 456   2015-06-04              12 TRUE                    6  57 days         6
    15 456   2015-06-16               2 TRUE                    6  12 days         6
    16 456   2015-06-18              49 FALSE                   6   2 days         6
    17 456   2015-08-06              14 TRUE                    7  49 days         7
    18 456   2015-08-20              42 FALSE                   7  14 days         7
    19 456   2015-10-01              12 TRUE                    8  42 days         8
    20 456   2015-10-13              16 TRUE                    8  12 days         8
    21 456   2015-10-29              12 TRUE                    8  16 days         8
    22 456   2015-11-10              65 FALSE                   8  12 days         8
    23 456   2016-01-14               1 TRUE                    9  65 days         9
    24 456   2016-01-15            -830 TRUE                    9   1 days         9
    

    【讨论】:

    • 太棒了!谢谢!然而,它发现了我做错的其他事情。一集中的最后一个日期将被推到下一集...例如,第 43 和 44 行应该是第 1 集,然后第 2 集应该从第 45 行开始。有什么建议可以解决这个问题吗?
    • 你能再描述一下规则/逻辑吗?那是因为第 44 行和第 45 行连续有 2 个 FALSE 吗?这也适用于第 54 行和第 55 行吗?您愿意用“预期的”episode 列值更新您的问题吗?我可以尝试修改答案以匹配您想要的结果。
    • 我更新了预期值和新数据。也许之间的时间不是确定起点/终点的最佳方式,但我想不出另一种方式。
    • @akano 查看已编辑的答案。如果规则基于 >= 30 天的差异,则可以简化方法。看看这是否满足您的需求。
    • 我确实过于复杂了。感谢您的帮助!
    猜你喜欢
    • 2018-07-05
    • 2021-12-26
    • 2017-11-03
    • 2020-04-03
    • 2021-11-18
    • 2019-12-06
    • 2018-02-03
    • 1970-01-01
    • 1970-01-01
    相关资源
    最近更新 更多