【问题标题】:Search many variables with a vector of patterns in R/tidyverse(在 R/tidyverse 中使用模式向量搜索许多变量)
【发布时间】:2022-01-07 22:46:42
【问题描述】:

我想在我的数据框 (data_df) 的许多变量 (key1–key30) 中搜索任意一个模式(这些模式存储在向量 my_patterns 中)。对于每个观测,结果将存储在 30 个虚拟变量/列 (key1_match–key30_match) 中:1 表示该观测的 "keyX" 变量与 my_patterns 向量中的某个值匹配,0 表示不匹配。我只需要知道是否存在匹配项,而不需要知道匹配的是哪一个。

如何在 R 中做到这一点,最好使用 tidyverse 函数?

# Codes to search for in the key1..key30 columns of data_df.
my_patterns <- c("AF021", "DT022", "DV053", "UJC12", "UJD02", "UJD05", "AF012", "AG053", "JAH01", "JCA55", "QBB99")
# Example data (dput() output): 29 rows with a numeric `id` column, where ids
# repeat across rows, and 30 character columns key1..key30. Empty cells are
# "" in key1 and NA in the other key columns.
data_df <- structure(list(id = c(1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 1317, 11832, 
1943, 1316, 8317, 13405, 12881, 12881, 12881, 12881, 12881, 12881, 
12882, 12882, 12882, 12882, 12883, 12883, 12883), key1 = c("", 
"", "", "", "DR029", "", "AF063", "UJD05", "JCF12", "", "AF021", 
"DT022", "XS912", "UJC12", "UJD05", "JAH00", "UJD02", "DT016", 
"DT016", "", "DV071", "DR029", "2154", "", "AJ079", "XV018", 
"7462", "7460", "LEG10"), key2 = c(NA, NA, NA, NA, NA, NA, NA, 
NA, "JFF00", NA, "AF021", "DT022", "XS912", "UJC12", "UJD05", 
"JAH00", "UJD05", "DT017", "DT017", NA, "DV022", "JDB10", NA, 
NA, "AJ080", NA, NA, "7461", "LCA06"), key3 = c(NA, NA, NA, NA, 
NA, NA, NA, NA, "UJD02", NA, "AF021", "DT022", "ZV033", "UJC12", 
"UJD05", "JAH00", "AF012", "DT019", "DT019", NA, "DV079", NA, 
NA, NA, "DR029", NA, NA, "7469", NA), key4 = c(NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, "AF021", "DT022", "DV071", "UJC12", "UJD05", 
"JAH00", "AG053", NA, "DT024", NA, "DV027", NA, NA, NA, "DT016", 
NA, NA, "7280", NA), key5 = c(NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, "AF021", "DT022", "DV071", "UJC12", "UJD05", "JKB30", 
"JAH01", NA, NA, NA, "DV064", NA, NA, NA, "UJD02", NA, NA, NA, 
NA), key6 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "UJD02", 
"DT022", "DV071", "UJC12", "UJD05", "JKB30", "JCA55", NA, NA, 
NA, "DV040", NA, NA, NA, NA, NA, NA, NA, NA), key7 = c(NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, "UJD02", "DT022", "DV071", "UJD05", 
"JCA55", "JKB30", "UJD02", NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA), key8 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
"UJD02", "DV051", "DV071", "UJD05", "JCA55", "JKB30", NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), key9 = c(NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, "UJD02", "DV053", "DV071", "UJD05", 
"JCA55", "JFK10", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA), key10 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "UJD02", 
"DV055", "DV071", "UJD05", "TPW99", "JFK10", NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA), key11 = c(NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, "AF053", "DV057", "DV071", "UJD05", "TPW99", 
"JFK10", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), 
    key12 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "AF053", 
    "DV057", "DV071", "UJD05", "TPW99", "JFK10", NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), key13 = c(NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, "AF053", "DV057", "DV071", 
    "JCA55", "AJ050", "JFB40", NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA), key14 = c(NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, "AF053", "DV057", "DV071", "JCA55", "AJ050", 
    "JFB40", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA), key15 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "AF053", 
    "DV057", "DV071", "JCA55", "AJ050", "JFB40", NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), key16 = c(NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, "AG009", "DV057", "DV071", 
    "JCA55", "AG040", "JFB40", NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA), key17 = c(NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, "AG009", "DV057", "DV071", "JCA55", "AG040", 
    "JFF23", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA), key18 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "AG009", 
    "DV057", "DV071", "JCA55", "AG040", "JFF23", NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), key19 = c(NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, "AG009", "DV057", "DV071", 
    "JCA55", "XS009", "JFF23", NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA), key20 = c(NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, "AG009", "DV057", "DV071", "JCA55", "XS009", 
    "JFF23", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA), key21 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "AJ004", 
    "DV057", "DT016", "JCA55", "XS009", "JWA00", NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), key22 = c(NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, "AJ004", "DV057", "DV071", 
    "JCA55", "UJD05", "JWA00", NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA), key23 = c(NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, "AJ004", "DV057", "XS918", "JCA55", "UJD05", 
    "JWA00", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA), key24 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "AJ004", 
    "DV057", "DV071", "JCA55", "JCA55", "JWA00", NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), key25 = c(NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, "AJ004", "DV057", "DV071", 
    "JCA55", "TPW99", "QBB99", NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA), key26 = c(NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, "DV057", "DV071", "JCA55", "AJ050", "QBB99", 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), key27 = c(NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "DV057", "DV071", 
    "JCA55", "AG040", "QBB99", NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA), key28 = c(NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, "DV057", "DV071", "JCA55", "XS009", "QBB99", 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), key29 = c(NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "DV057", "DV071", 
    "JCA55", NA, "QBB99", NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA), key30 = c(NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, "DV057", "DV071", "JCA55", NA, "QBB99", NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c(1L, 
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11814L, 106482L, 17355L, 
11807L, 74026L, 120903L, 116030L, 116031L, 116032L, 116033L, 
116034L, 116035L, 116036L, 116037L, 116038L, 116039L, 116040L, 
116041L, 116042L), class = "data.frame")

【问题讨论】:

    标签: r dplyr tidyverse stringr


    【解决方案1】:
    library(tidyverse)
    
    # Codes to search for (repeated here so the snippet runs on its own).
    my_patterns <- c("AF021", "DT022", "DV053", "UJC12", "UJD02", "UJD05", "AF012", "AG053", "JAH01", "JCA55", "QBB99")
    # Match any element: join the patterns into one alternation (OR) regex.
    # Note: str_detect() treats this as a regular expression and matches
    # substrings, so a value only has to CONTAIN one of the codes.
    my_regex <- paste0(my_patterns, collapse = "|")
    
    data_df %>%
      as_tibble() %>%
      # Long format: one row per (original row, key column) with the columns
      # id, name and value.
      pivot_longer(-id) %>%
      # str_detect() is vectorised, so no element-wise map is needed. NA cells
      # yield NA, which is treated as "no match" before converting to 0/1.
      mutate(
        value = str_detect(value, my_regex) %>%
          replace_na(FALSE) %>%
          as.numeric()
      ) %>%
      # An id can span several rows, so duplicates per (id, name) are collapsed
      # with max(): the flag is 1 if ANY row of that id matched. Keeping only
      # the first row per id would silently drop matches found in later rows
      # (e.g. id 3 and id 12882 in this data).
      pivot_wider(names_from = name, values_from = value, values_fn = max)
    #> # A tibble: 12 x 31
    #>       id  key1  key2  key3  key4  key5  key6  key7  key8  key9 key10 key11 key12
    #>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
    #>  1     1     0     0     0     0     0     0     0     0     0     0     0     0
    #>  2     2     0     0     0     0     0     0     0     0     0     0     0     0
    #>  3     3     1     0     1     0     0     0     0     0     0     0     0     0
    #>  4  1317     1     1     1     1     1     1     1     1     1     1     0     0
    #>  5 11832     1     1     1     1     1     1     1     0     1     0     0     0
    #>  6  1943     0     0     0     0     0     0     0     0     0     0     0     0
    #>  7  1316     1     1     1     1     1     1     1     1     1     1     1     1
    #>  8  8317     1     1     1     1     1     1     1     1     1     0     0     0
    #>  9 13405     0     0     0     0     0     0     0     0     0     0     0     0
    #> 10 12881     1     1     1     1     1     1     1     0     0     0     0     0
    #> 11 12882     0     0     0     0     1     0     0     0     0     0     0     0
    #> 12 12883     0     0     0     0     0     0     0     0     0     0     0     0
    #> # … with 18 more variables: key13 <dbl>, key14 <dbl>, key15 <dbl>, key16 <dbl>,
    #> #   key17 <dbl>, key18 <dbl>, key19 <dbl>, key20 <dbl>, key21 <dbl>,
    #> #   key22 <dbl>, key23 <dbl>, key24 <dbl>, key25 <dbl>, key26 <dbl>,
    #> #   key27 <dbl>, key28 <dbl>, key29 <dbl>, key30 <dbl>
    

    由 reprex 包 (v2.0.1) 创建于 2021-12-01

    【讨论】:

      【解决方案2】:

      使用 dplyr,我们可以对所有以 'key' 开头的列(starts_with('key'))同时使用 mutate 和 across。如果该值与 my_patterns 中的任何一个模式相同,value %in% my_patterns 返回 TRUE,否则返回 FALSE。我们可以使用 +(...) 将逻辑值强制转换为数值。最后,按 id 分组(group_by),并用 max 进行 summarise。

      library(dplyr)
      
      # Step 1: turn every key* column into a 0/1 flag; the unary + coerces the
      #         logical result of %in% to integer.
      # Step 2: collapse to one row per id; max() is 1 if any row of that id
      #         had a match in the column.
      data_df %>%
        mutate(
          across(
            starts_with("key"),
            function(col) +(col %in% my_patterns)
          )
        ) %>%
        group_by(id) %>%
        summarise(
          across(starts_with("key"), function(flag) max(flag))
        )
      
      
      # A tibble: 12 × 31
            id  key1  key2  key3  key4  key5  key6  key7  key8  key9 key10 key11 key12 key13 key14 key15 key16 key17 key18 key19 key20 key21 key22
         <dbl> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
       1     1     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0
       2     2     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0
       3     3     1     0     1     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0
       4  1316     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1
       5  1317     1     1     1     1     1     1     1     1     1     1     0     0     0     0     0     0     0     0     0     0     0     0
       6  1943     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0
       7  8317     1     1     1     1     1     1     1     1     1     0     0     0     0     0     0     0     0     0     0     0     0     1
       8 11832     1     1     1     1     1     1     1     0     1     0     0     0     0     0     0     0     0     0     0     0     0     0
       9 12881     1     1     1     1     1     1     1     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0
      10 12882     0     0     0     0     1     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0
      11 12883     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0
      12 13405     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0
      # … with 8 more variables: key23 <int>, key24 <int>, key25 <int>, key26 <int>, key27 <int>, key28 <int>, key29 <int>, key30 <int>
      

      或者,我们可以把所有转换都放进 summarise(across()) 中,从而进一步简化:

      library(dplyr)
      
      # For each id, flag with 0/1 whether any value of the key column is one of
      # my_patterns; the unary + coerces the logical from any() to integer.
      data_df %>%
        group_by(id) %>%
        summarise(
          across(
            starts_with("key"),
            function(col) +(any(col %in% my_patterns))
          )
        )
      

      【讨论】:

        【解决方案3】:

        另一种解决方案:

        library(tidyr)
        
        data_df %>%
          # Long format first (columns id, name, value), so that pivot_wider()
          # gets explicit names_from / values_from columns instead of relying on
          # a default `name` column that data_df does not have.
          pivot_longer(-id) %>%
          pivot_wider(
            # Named explicitly: passing id_cols by position is deprecated in
            # recent tidyr releases.
            id_cols = id,
            names_from = name,
            values_from = value,
            # 0/1 indicator per (id, key): any() rather than sum(), so an id
            # with several matching rows still yields 1, not a count.
            # %in% never returns NA, so NA cells count as "no match".
            values_fn = function(x) as.integer(any(x %in% my_patterns))
          )
        
        #> # A tibble: 12 × 31
        #>       id  key1  key2  key3  key4  key5  key6  key7  key8  key9 key10 key11 key12
        #>    <dbl> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
        #>  1     1     0     0     0     0     0     0     0     0     0     0     0     0
        #>  2     2     0     0     0     0     0     0     0     0     0     0     0     0
        #>  3     3     1     0     1     0     0     0     0     0     0     0     0     0
        #>  4  1317     1     1     1     1     1     1     1     1     1     1     0     0
        #>  5 11832     1     1     1     1     1     1     1     0     1     0     0     0
        #>  6  1943     0     0     0     0     0     0     0     0     0     0     0     0
        #>  7  1316     1     1     1     1     1     1     1     1     1     1     1     1
        #>  8  8317     1     1     1     1     1     1     1     1     1     0     0     0
        #>  9 13405     0     0     0     0     0     0     0     0     0     0     0     0
        #> 10 12881     1     1     1     1     1     1     1     0     0     0     0     0
        #> 11 12882     0     0     0     0     1     0     0     0     0     0     0     0
        #> 12 12883     0     0     0     0     0     0     0     0     0     0     0     0
        #> # … with 18 more variables: key13 <int>, key14 <int>, key15 <int>,
        #> #   key16 <int>, key17 <int>, key18 <int>, key19 <int>, key20 <int>,
        #> #   key21 <int>, key22 <int>, key23 <int>, key24 <int>, key25 <int>,
        #> #   key26 <int>, key27 <int>, key28 <int>, key29 <int>, key30 <int>
        

        【讨论】:

        • 我尝试时似乎不起作用
        • 感谢您的评论,@GuedesBF。我刚刚试过没有错误。你得到什么错误?
        • 你是对的,@GuedesBF。谢谢!我误读了这个问题。我的答案的编辑版本应该按照 OP 的要求工作。
        猜你喜欢
        • 1970-01-01
        • 1970-01-01
        • 1970-01-01
        • 2023-04-06
        • 1970-01-01
        • 2016-07-28
        • 1970-01-01
        • 2023-03-11
        • 1970-01-01
        相关资源
        最近更新 更多