【问题标题】:Find first example of unique values and return row number查找唯一值的第一个示例并返回行号
【发布时间】:2018-11-06 14:05:58
【问题描述】:

我有这个数据框:

# Example data: 79 rows covering two subjects (Sub1, Sub2), each with two
# stimulus blocks (Stim1, Stim2). Fixation is either NA or an integer value.
df <- structure(
  list(
    # Rows 1-39 belong to Sub1 and rows 40-79 to Sub2.
    Name = rep(c("Sub1", "Sub2"), times = c(39L, 40L)),
    # Stimulus blocks cover rows 1-19, 20-39, 40-59 and 60-79.
    StimulusName = rep(
      c("Stim1", "Stim2", "Stim1", "Stim2"),
      times = c(19L, 20L, 20L, 20L)
    ),
    Fixation = c(
      # Sub1 / Stim1 (rows 1-19)
      NA, NA, 1L, 1L, NA, NA, 2L, 2L, 3L, 3L,
      NA, NA, NA, NA, NA, 4L, 4L, 5L, 5L,
      # Sub1 / Stim2 (rows 20-39)
      NA, NA, NA, NA, 1L, 1L, 1L, 1L, 2L, 2L,
      2L, 2L, 2L, 2L, NA, NA, NA, 3L, 3L, 3L,
      # Sub2 / Stim1 (rows 40-59)
      NA, NA, NA, NA, NA, NA, 1L, 1L, 1L, 1L,
      2L, 2L, NA, NA, 3L, 3L, 3L, 4L, 4L, 4L,
      # Sub2 / Stim2 (rows 60-79)
      NA, NA, 1L, 1L, NA, NA, 2L, 2L, 3L, 3L,
      NA, NA, NA, NA, NA, 4L, 4L, 5L, 5L, NA
    )
  ),
  row.names = c(NA, -79L),
  class = c("tbl_df", "tbl", "data.frame")
)

共有 3 列:Name、StimulusName 和 Fixation

我希望能够返回 Fixation 列中每个唯一值第一次出现时的行号,并按 Name 和 StimulusName 分组。

这是我迄今为止尝试过的(基于在其他地方找到的部分解决方案):

# Return the positions at which each distinct value first appears.
# `Values` is any vector; the result is an integer vector of positions
# counted from the start of `Values` as it was passed in.
Unique_Indices <- function(Values) {
  # duplicated() flags every repeat of an earlier element, so the elements
  # that are not flagged are the first occurrences.
  which(!duplicated(Values))
}

但是当我将它与 dplyr 链一起使用时,它不会返回原始行号,而是通过分组重新开始行计数:

library(tidyr)

# This doesn't work: summarise() evaluates Unique_Indices() once per
# Name/StimulusName group and passes it only that group's Fixation values, so
# the positions it returns are counted from the start of each group rather than
# from the start of df.
# NOTE(review): %>%, group_by() and summarise() are dplyr functions, but only
# tidyr is loaded above - presumably dplyr is already attached; confirm.
Unique_Index <- df %>%
  group_by(Name, StimulusName) %>%
  # Wrap the per-group index vector in a list so each group yields one row.
  summarise(Indices = list(Unique_Indices(Fixation))) %>%
  # Expand the list column back into one row per index.
  unnest()

不正确的输出如下所示:

您可以看到,Indices 移动到下一个 StimulusName 后,由于 group_by 指令,它不包含原始行号。在保留df 的原始行号的同时,我有什么办法可以group_by 吗?

【问题讨论】:

  • 正确的预期结果是什么?
  • 我不确定,但 df %>% rownames_to_column() %>% group_by(Name, StimulusName) %>% filter(!duplicated(Fixation)) 是否给出了您预期的输出?
  • 您的数据中没有唯一值
  • 嗨@kath,这似乎奏效了,是的。如果您将解决方案弹出到答案中,我会接受。

标签: r dplyr unique


【解决方案1】:

您可以直接过滤每组 Fixation 的非重复值,并首先将行名转换为适当的列以保留索引。

library(dplyr)
library(tibble)

# Copy the row names into a "rowname" column before grouping so the original
# row positions are carried along, then keep, within every Name/StimulusName
# group, only the first row of each distinct Fixation value (NA counts as a
# value too).
df %>%
  rownames_to_column(var = "rowname") %>%
  group_by(Name, StimulusName) %>%
  slice(which(!duplicated(Fixation)))

# A tibble: 21 x 4
# Groups:   Name, StimulusName [4]
#    rowname Name  StimulusName Fixation
#    <chr>   <chr> <chr>           <int>
#  1 1       Sub1  Stim1              NA
#  2 3       Sub1  Stim1               1
#  3 7       Sub1  Stim1               2
#  4 9       Sub1  Stim1               3
#  5 16      Sub1  Stim1               4
#  6 18      Sub1  Stim1               5
#  7 20      Sub1  Stim2              NA
#  8 24      Sub1  Stim2               1
#  9 28      Sub1  Stim2               2
# 10 37      Sub1  Stim2               3
# ... with 11 more rows

根据 Ronak Shah 的建议,dplyr-only 解决方案可能如下所示:

# Number the rows while df is still ungrouped so Index refers to positions in
# the full table, then keep the first row of each distinct Fixation value
# within every Name/StimulusName group.
df %>%
  mutate(Index = seq_len(n())) %>%
  group_by(Name, StimulusName) %>%
  filter(!duplicated(Fixation))

【讨论】:

    【解决方案2】:

    data.table 中有一个特殊变量 .I,它保存行号,所以你可以直接对 .I 取子集。我的输出中的 Indices 应该与 @kath 评论中代码输出的 rowname 相同。

    library(data.table)
    # Convert df to a data.table in place (by reference).
    setDT(df)

    # Within a grouped query .I holds each row's position in the full table,
    # so subsetting it to the non-duplicated Fixation values of every
    # Name/StimulusName group yields the original row numbers.
    df[,
       .(Indices = .I[!duplicated(Fixation)]),
       by = .(Name, StimulusName)]
    
    
    #     Name StimulusName Indices
    #  1: Sub1        Stim1       1
    #  2: Sub1        Stim1       3
    #  3: Sub1        Stim1       7
    #  4: Sub1        Stim1       9
    #  5: Sub1        Stim1      16
    #  6: Sub1        Stim1      18
    #  7: Sub1        Stim2      20
    #  8: Sub1        Stim2      24
    #  9: Sub1        Stim2      28
    # 10: Sub1        Stim2      37
    # 11: Sub2        Stim1      40
    # 12: Sub2        Stim1      46
    # 13: Sub2        Stim1      50
    # 14: Sub2        Stim1      54
    # 15: Sub2        Stim1      57
    # 16: Sub2        Stim2      60
    # 17: Sub2        Stim2      62
    # 18: Sub2        Stim2      66
    # 19: Sub2        Stim2      68
    # 20: Sub2        Stim2      75
    # 21: Sub2        Stim2      77
    #     Name StimulusName Indices
    

    【讨论】:

      【解决方案3】:

      这是base R的选项

      # duplicated() on a data frame compares whole rows, so a row is kept
      # when its Name/StimulusName/Fixation combination has not been seen
      # before. The column called Fixation in the result holds the row numbers
      # of those first occurrences, not the fixation values themselves.
      cbind(
        unique(df)[c("Name", "StimulusName")],
        Fixation = which(!duplicated(df))
      )
      #   Name StimulusName Fixation
      #1  Sub1        Stim1        1
      #2  Sub1        Stim1        3
      #3  Sub1        Stim1        7
      #4  Sub1        Stim1        9
      #5  Sub1        Stim1       16
      #6  Sub1        Stim1       18
      #7  Sub1        Stim2       20
      #8  Sub1        Stim2       24
      #9  Sub1        Stim2       28
      #10 Sub1        Stim2       37
      #11 Sub2        Stim1       40
      #12 Sub2        Stim1       46
      #13 Sub2        Stim1       50
      #14 Sub2        Stim1       54
      #15 Sub2        Stim1       57
      #16 Sub2        Stim2       60
      #17 Sub2        Stim2       62
      #18 Sub2        Stim2       66
      #19 Sub2        Stim2       68
      #20 Sub2        Stim2       75
      #21 Sub2        Stim2       77
      

      【讨论】:

        猜你喜欢
        • 1970-01-01
        • 1970-01-01
        • 1970-01-01
        • 2017-02-21
        • 1970-01-01
        • 1970-01-01
        • 1970-01-01
        • 2020-12-06
        • 1970-01-01
        相关资源
        最近更新 更多