【问题标题】:Identify consecutive duplicates in R using accumulate使用累积识别 R 中的连续重复项
【发布时间】:2021-09-07 08:04:57
【问题描述】:

让我分享一个我正在尝试做的例子,因为标题可能不像我希望的那样清晰。

data <- tibble(week=1:10,name=c(rep("Joe",10)),value=c(.9,.89,.99,.98,.87,.89,.93,.92,.98,.9),
               wanted = c("Yes","Skip","No","No","Yes","Skip","Yes","Skip","No","Yes"))

data <- data %>% mutate(my_attempt = case_when( week-lag(week)==1 & 
                                          value < .95 &
                                          lag(value) < .95 &
                                          lag(value,2) >= .95 &
                                          !is.na(lag(value,2))~ "Skip",
                                        week-lag(week)==1 & 
                                          value < .95 &
                                          lag(value) < .95 &
                                          is.na(lag(value,2))~ "Skip",
                                        value < .95 ~"Yes",
                                        TRUE ~ "No"))    

 #   week name  value wanted my_attempt
 #  <int> <chr> <dbl> <chr>  <chr>     
 #     1 Joe    0.9  Yes    Yes       
 #     2 Joe    0.89 Skip   Skip      
 #     3 Joe    0.99 No     No        
 #     4 Joe    0.98 No     No        
 #     5 Joe    0.87 Yes    Yes       
 #     6 Joe    0.89 Skip   Skip      
 #     7 Joe    0.93 Yes    Yes       
 #     8 Joe    0.92 Skip   Yes       
 #     9 Joe    0.98 No     No        
 #    10 Joe    0.9  Yes    Yes    

我正在尝试让 my_attempt 列生成所需列的结果。我想在值小于某个阈值时识别行,但不能有两个连续的“是”值。我的尝试一直有效,直到它连续看到 4 个或更多低值。在我的真实数据中,可能会缺少几周,但这可以被视为“否”。例如,如果缺少第 6 周,则第 7 周仍然可以选择“是”(我认为在我的情况下第一行会处理这个问题)。有没有办法在 R 中做到这一点?它不必与 dplyr 保持一致,但如果在 tidyverse 中可行,那就太好了。

【问题讨论】:

    标签: r dplyr duplicates rolling-computation accumulate


    【解决方案1】:

    这是一个简单的dplyr 解决方案:

    library(dplyr)
    
    data %>%
      mutate(grp = cummax(week - lag(week, default = 0))) %>%
      group_by(name, grp) %>%
      mutate(my_attempt = ifelse(value < 0.95 & lag(value, default = 1) < 0.95, "Skip", 
                                 ifelse(value < 0.95 & lag(value, default = 1) >= 0.95, 
                                        "Yes", "No")))
    
    # A tibble: 9 x 6
    # Groups:   name, grp [2]
       week name  value wanted   grp my_attempt
      <int> <chr> <dbl> <chr>  <dbl> <chr>     
    1     1 Joe    0.9  Yes        1 Yes       
    2     2 Joe    0.89 Skip       1 Skip      
    3     3 Joe    0.99 No         1 No        
    4     4 Joe    0.98 No         1 No        
    5     5 Joe    0.87 Yes        1 Yes       
    6     7 Joe    0.93 Yes        2 Yes       
    7     8 Joe    0.92 Skip       2 Skip      
    8     9 Joe    0.98 No         2 No        
    9    10 Joe    0.9  Yes        2 Yes 
    

    以下是在缺少周值的数据集上使用 base::Reduce 的方法。我首先根据周值之间的差异创建了一个分组grp,然后根据分组变量创建了split 数据集。之后我将我们的函数应用于每个块并将结果与​​rbind 绑定:

    do.call(rbind, lapply(split(data, cummax(abs(data$week - c(0, data$week[-nrow(data)]))), data$name), 
                          \(x){
                            x$my_attept <- Reduce(function(a, b) {
                              if(x$value[b] < 0.95 & a != "Yes") {
                                "Yes"
                              } else if(x$value[b] < 0.95 & a == "Yes") {
                                "Skip"
                              } else {
                                "No"
                              }
                            }, 2:nrow(x), init = ifelse(x$value[1] < 0.95, "Yes", "No"), accumulate = TRUE)
                            x
                          }))
    
    # A tibble: 9 x 5
       week name  value wanted my_attept
    * <int> <chr> <dbl> <chr>  <chr>    
    1     1 Joe    0.9  Yes    Yes      
    2     2 Joe    0.89 Skip   Skip     
    3     3 Joe    0.99 No     No       
    4     4 Joe    0.98 No     No       
    5     5 Joe    0.87 Yes    Yes      
    6     7 Joe    0.93 Yes    Yes      
    7     8 Joe    0.92 Skip   Skip     
    8     9 Joe    0.98 No     No       
    9    10 Joe    0.9  Yes    Yes 
    

    如果您的数据中缺少数周,例如此处修改的数据集,您可以使用以下解决方案。我们首先根据它们的连续值对周进行分组,然后将我们的解决方案应用于每个组:

    data %>%
      mutate(grp = cummax(week - lag(week, default = 0))) %>%
      group_by(name, grp) %>%
      mutate(my_attept = accumulate(value[-1], .init = ifelse(value[1] < 0.95, "Yes", "No"),
                                    ~ if(.y < 0.95 & .x != "Yes") {
                                      "Yes"
                                    } else if(.y < 0.95 & .x == "Yes") {
                                      "Skip"
                                    } else {
                                      "No"
                                    }))
    
    # A tibble: 9 x 6
    # Groups:   grp [2]
       week name  value wanted   grp my_attept
      <int> <chr> <dbl> <chr>  <dbl> <chr>    
    1     1 Joe    0.9  Yes        1 Yes      
    2     2 Joe    0.89 Skip       1 Skip     
    3     3 Joe    0.99 No         1 No       
    4     4 Joe    0.98 No         1 No       
    5     5 Joe    0.87 Yes        1 Yes      
    6     7 Joe    0.93 Yes        2 Yes      
    7     8 Joe    0.92 Skip       2 Skip     
    8     9 Joe    0.98 No         2 No       
    9    10 Joe    0.9  Yes        2 Yes 
    

    数据

    structure(list(week = c(1L, 2L, 3L, 4L, 5L, 7L, 8L, 9L, 10L), 
        name = c("Joe", "Joe", "Joe", "Joe", "Joe", "Joe", "Joe", 
        "Joe", "Joe"), value = c(0.9, 0.89, 0.99, 0.98, 0.87, 0.93, 
        0.92, 0.98, 0.9), wanted = c("Yes", "Skip", "No", "No", "Yes", 
        "Yes", "Skip", "No", "Yes")), row.names = c(NA, -9L), class = c("tbl_df", 
    "tbl", "data.frame"))
    

    【讨论】:

    • 包括name在group_by也是亲爱的朋友
    【解决方案2】:

    我会使用像slider 这样的滚动计算库来完成它,其中丢失的数据可以很好地被索引。向您展示修改后的数据

    library(tidyverse)
    data <- tibble(week=c(1:5, 7:10),name=c(rep("Joe",9)),value=c(.9,.89,.99,.98,.87,.93,.92,.98,.9),
                   wanted = c("Yes","Skip","No","No","Yes","Yes","Skip","No","Yes"))
    
    data
    #> # A tibble: 9 x 4
    #>    week name  value wanted
    #>   <int> <chr> <dbl> <chr> 
    #> 1     1 Joe    0.9  Yes   
    #> 2     2 Joe    0.89 Skip  
    #> 3     3 Joe    0.99 No    
    #> 4     4 Joe    0.98 No    
    #> 5     5 Joe    0.87 Yes   
    #> 6     7 Joe    0.93 Yes   
    #> 7     8 Joe    0.92 Skip  
    #> 8     9 Joe    0.98 No    
    #> 9    10 Joe    0.9  Yes
    library(slider)
    
    data %>% group_by(name) %>%
      mutate(wanted2 = case_when(value < 0.95 & slide_index_lgl(.x = value, 
                                                                .i = week, 
                                                                .f = ~  any(.x < 0.95), 
                                                                .before = 1, 
                                                                .after = -1) ~ 'skip',
                                 value < 0.95 ~ 'yes',
                                 TRUE ~ 'no'))
    #> # A tibble: 9 x 5
    #> # Groups:   name [1]
    #>    week name  value wanted wanted2
    #>   <int> <chr> <dbl> <chr>  <chr>  
    #> 1     1 Joe    0.9  Yes    yes    
    #> 2     2 Joe    0.89 Skip   skip   
    #> 3     3 Joe    0.99 No     no     
    #> 4     4 Joe    0.98 No     no     
    #> 5     5 Joe    0.87 Yes    yes    
    #> 6     7 Joe    0.93 Yes    yes    
    #> 7     8 Joe    0.92 Skip   skip   
    #> 8     9 Joe    0.98 No     no     
    #> 9    10 Joe    0.9  Yes    yes
    

    即使不使用slider 也可以完成,即仅在dplyr

    library(dplyr)
    data %>% group_by(name) %>%
      mutate(wanted2 = case_when(value < 0.95 & lag(value, default = 1) < 0.95 & week - 1 == lag(week, default = 0) ~ 'Skip',
                                 value < 0.95 ~ 'Yes',
                                 TRUE ~ 'No'))
    
    #> # A tibble: 9 x 5
    #> # Groups:   name [1]
    #>    week name  value wanted wanted2
    #>   <int> <chr> <dbl> <chr>  <chr>  
    #> 1     1 Joe    0.9  Yes    Yes    
    #> 2     2 Joe    0.89 Skip   Skip   
    #> 3     3 Joe    0.99 No     No     
    #> 4     4 Joe    0.98 No     No     
    #> 5     5 Joe    0.87 Yes    Yes    
    #> 6     7 Joe    0.93 Yes    Yes    
    #> 7     8 Joe    0.92 Skip   Skip   
    #> 8     9 Joe    0.98 No     No     
    #> 9    10 Joe    0.9  Yes    Yes
    

    reprex package (v2.0.0) 于 2021 年 7 月 25 日创建

    【讨论】:

      【解决方案3】:

      我想你可以在这里使用purrr:accumulate()

      library(purrr)
      library(dplyr)
      
      data%>%mutate(my_attempt = ifelse(week-lag(week, default = 0)==1 & 
                                                value < .95,
                                        'Yes', 'No')%>%
              accumulate(~ifelse(.x==.y & .y=='Yes', 'Skip', .y)))
      
      # A tibble: 10 x 5
          week name  value wanted my_attempt
         <int> <chr> <dbl> <chr>  <chr>     
       1     1 Joe    0.9  Yes    Yes       
       2     2 Joe    0.89 Skip   Skip      
       3     3 Joe    0.99 No     No        
       4     4 Joe    0.98 No     No        
       5     5 Joe    0.87 Yes    Yes       
       6     6 Joe    0.89 Skip   Skip      
       7     7 Joe    0.93 Yes    Yes       
       8     8 Joe    0.92 Skip   Skip      
       9     9 Joe    0.98 No     No        
      10    10 Joe    0.9  Yes    Yes 
      

      【讨论】:

      • 这里优雅地使用accumulate
      • 向我们的朋友@AnilGoyal 学习
      • 这个问题让我有点困惑,操作应该给出更多关于如何计算所需输出的描述。
      • 效果很好!谢谢@AnoushiravanR。 @GuedesBF,如果 value 在示例中低于 0.95,我希望 my_attempt 返回“是”。但是,如果前一周已经有“是”,我希望它改为返回“跳过”
      • 我添加了一个基本的 R 解决方案和一个不太不同的 tidyverse 替代方案。
      猜你喜欢
      • 2019-02-05
      • 2019-02-24
      • 2018-11-26
      • 2014-05-11
      • 1970-01-01
      • 1970-01-01
      • 2019-11-19
      • 2011-08-23
      • 1970-01-01
      相关资源
      最近更新 更多