【问题标题】:Using dplyr: Within groups, select the first value meeting a condition使用 dplyr:在组内,选择第一个满足条件的值
【发布时间】:2020-01-16 22:00:58
【问题描述】:

我需要一个解决方案:按时间倒序(从后往前)扫描,并取得第一个满足条件的值。我的数据类似于:

# Fix the RNG seed so the sample() draws below are repeatable.
set.seed(42)

# Build 20 rows: a group label drawn from A/B/C, a time index drawn from 1..20
# and a value drawn from 1..10. Every column is sampled with replacement, so
# the same (id, time.var) pair can occur more than once.
df <- data.frame(
  id = sample(LETTERS[1:3], 20, replace = TRUE),
  time.var = sample(1:20, 20, replace = TRUE),
  x = sample(c(1:10), 20, replace = TRUE)
  )

# Sort the rows by group, then by time within each group.
df <- df[order(df$id, df$time.var),]

 id time.var  x
  A        5  2
  A       14  8
  A       19  7
  A       20  1
  B        1  1
  B        2  5
  B        9 10
  B       11 10
  B       13  6
  B       15  4
  B       19  3
  C        1  7
  C        3  5
  C        8  9
  C        8  4
  C       17  7
  C       17  4
  C       17  8
  C       19  4
  C       19 10

对于每个组中按 time.var 时间顺序排在最后的那一行,我想按时间降序扫描该组中更早的行(不包括最后一行本身),取得 x 中第一个小于 5 的值。

我试过了:

# Attempt: per id, add a column `less.5`.
# x[x < 5] keeps only the group's values below 5; which.max() then returns the
# position of the first maximum inside that subset. The result is therefore an
# index into the filtered vector, not a value of x, and it does not depend on
# the time order of the rows.
test <- df %>% 
        group_by(id) %>% 
        arrange(id, time.var) %>% 
        mutate(less.5 = which.max(x[x < 5]) )

我可以使用什么策略来获得这种类型的输出:

 id time.var  x  previous.less.5
  A        5  2
  A       14  8
  A       19  7
  A       20  1      2
  B        1  1
  B        2  5
  B        9 10
  B       11 10
  B       13  6
  B       15  4
  B       19  3      4
  C        1  7
  C        3  5
  C        8  9
  C        8  4
  C       17  7
  C       17  4
  C       17  8
  C       19  4
  C       19 10      4

【问题讨论】:

    标签: r dataframe dplyr


    【解决方案1】:

    使用library(dplyr):

    # For the last row of each id, report the most recent earlier x that is
    # below 5; every other row gets NA.
    # Limitation: each id must have at least one x < 5 before its last row.
    # Otherwise tail() returns a zero-length vector and mutate() fails.
    df %>%
      arrange(id, time.var) %>%
      group_by(id) %>%
      # c(x[-n()] < 5, FALSE) flags x < 5 on every row except the last one,
      # which is forced to FALSE; tail(..., 1) keeps the latest flagged value.
      mutate(previous.less.5 = tail(x[c(x[-n()] < 5, FALSE)], 1)) %>%
      # The grouping from above is still active, so row_number() and n() are
      # evaluated per id. `false` must be a vector (NULL is rejected by
      # dplyr >= 1.1), and NA_integer_ matches the integer type of x.
      mutate(
        previous.less.5 = if_else(row_number() == n(), previous.less.5, NA_integer_)
      )
    

    # Join-based variant: find, per id, the latest x < 5 that occurs before the
    # id's last row, then attach it to the last row of that id only.
    df %>%
      arrange(id, time.var) %>%
      group_by(id) %>%
      # Drop the last row of each group. seq_len() is used instead of
      # 1:(n() - 1) because the latter becomes c(1, 0) for a one-row group and
      # would keep that row.
      slice(seq_len(n() - 1)) %>%
      filter(x < 5) %>%
      # Keep the most recent remaining row per id (groups with no match vanish).
      slice(n()) %>%
      select(-time.var) %>%
      # left_join keeps every row of df; ids without a match get x.y = NA
      # (a right_join would drop those ids from the result entirely).
      left_join(df, ., by = "id", suffix = c("", ".y")) %>%
      # Re-sort so that "last row" below does not depend on df being pre-sorted.
      arrange(id, time.var) %>%
      group_by(id) %>%
      # `false` must be a vector (NULL is rejected by dplyr >= 1.1); x is integer.
      mutate(previous.less.5 = if_else(row_number() == n(), x.y, NA_integer_)) %>%
      select(-x.y)
    

    结果:

    #> # A tibble: 20 x 4
    #> # Groups:   id [3]
    #>    id    time.var     x previous.less.5
    #>    <fct>    <int> <int>           <int>
    #>  1 A            3    10              NA
    #>  2 A            4     8              NA
    #>  3 A            4     6              NA
    #>  4 A            5     2              NA
    #>  5 A            5     8              NA
    #>  6 A            5     7              NA
    #>  7 A           11     6              NA
    #>  8 A           13     3              NA
    #>  9 A           15     2               3
    #> 10 B            2     1              NA
    #> 11 B            4     3              NA
    #> 12 B            4     6              NA
    #> 13 B            8     5              NA
    #> 14 B            8     4              NA
    #> 15 B           20     7               4
    #> 16 C            1     2              NA
    #> 17 C            2    10              NA
    #> 18 C           10     6              NA
    #> 19 C           13     2              NA
    #> 20 C           18     5               2
    

    更新:

    如果某个组中没有小于 5 的记录(或只有最后一条记录小于 5),可以使用以下代码:

    # For the last row of each id, report the most recent earlier x that is
    # below 5; every other row gets NA. Groups with no qualifying earlier value
    # (including single-row groups) get NA on their last row as well.
    df %>%
      arrange(id, time.var) %>%
      group_by(id) %>%
      mutate(previous.less.5 = if_else(
        row_number() == n(),
        # c(x[-n()] < 5, FALSE) flags x < 5 on every row but the last.
        # last() returns NA when nothing is flagged, so no -Inf clean-up step
        # (and no "no non-missing arguments to max" warning) is needed.
        last(x[c(x[-n()] < 5, FALSE)]),
        # `false` must be a vector (NULL is rejected by dplyr >= 1.1);
        # NA_integer_ matches the integer type of x.
        NA_integer_
      ))
    

    数据:

    # I am getting different data than what you've shown with this seed.
    # NOTE(review): possibly because R >= 3.6.0 changed the default sample()
    # algorithm (see ?RNGkind, `sample.kind`) - confirm against the R versions used.
    set.seed(42)
    
    # Build 20 rows: a group label drawn from A/B/C, a time index drawn from
    # 1..20 and a value drawn from 1..10, each sampled with replacement.
    df <- data.frame(
      id = sample(LETTERS[1:3], 20, replace = TRUE),
      time.var = sample(1:20, 20, replace = TRUE),
      x = sample(c(1:10), 20, replace = TRUE)
    )
    
    # Sort the rows by group, then by time within each group.
    df <- df[order(df$id, df$time.var),]
    

    【讨论】:

    • 简单一点:df %>% arrange(id, time.var) %>% group_by(id) %>% mutate(previous.less.5 = if_else(row_number() == n(), tail(sort(x[x < 5]), 1), NA_integer_)).
    • @H1 “简单”这个概念见仁见智。在我看来,tail(c(x[c((x[-n()] < 5), FALSE)]),1) 本身需要一些思考才能理解,因此将其与 if_else 结合起来会使其更复杂一些。但是您的版本可能更快。干杯。
    • 如果组没有值 NA 可能是唯一存在的其他值的条件?
    【解决方案2】:

    我们可以按 id 分组反转 x 的值,并使用 which 得到第一个小于 5 的数字。最后的 replace 将 previous.less.5 中除每组最后一行之外的所有值设为 NA。

    library(dplyr)
    
    df %>%
      # The data is already sorted by `id` and `time.var`; if yours is not,
      # enable the next line.
      #arrange(id, time.var) %>%
      group_by(id) %>%
      mutate(
        # Values before the group's last row, newest first, padded with a
        # leading NA so the helper column has one entry per row.
        x_desc = c(NA, rev(x[-n()])),
        # First entry below 5 in that newest-first order (NA when none exists).
        previous.less.5 = x_desc[match(TRUE, x_desc < 5)],
        # Only the last row of each group keeps the value.
        previous.less.5 = replace(previous.less.5, row_number() < n(), NA)
      ) %>%
      select(-x_desc)
    
    #   id    time.var     x previous.less.5
    #   <fct>    <int> <int>           <int>
    # 1 A            5     2              NA
    # 2 A           14     8              NA
    # 3 A           19     7              NA
    # 4 A           20     1               2
    # 5 B            1     1              NA
    # 6 B            2     5              NA
    # 7 B            9    10              NA
    # 8 B           11    10              NA
    # 9 B           13     6              NA
    #10 B           15     4              NA
    #11 B           19     3               4
    #12 C            1     7              NA
    #13 C            3     5              NA
    #14 C            8     9              NA
    #15 C            8     4              NA
    #16 C           17     7              NA
    #17 C           17     4              NA
    #18 C           17     8              NA
    #19 C           19     4              NA
    #20 C           19    10               4
    

    如果id 中没有小于5 的值,这也应该处理这种情况并返回NA

    【讨论】:

      猜你喜欢
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 2015-07-09
      • 2013-04-08
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      相关资源
      最近更新 更多